xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 09573ac72a50d3e7ecd55a2b7f0ef28450cd0a8b)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
34e2b4712SSatish Balay /*
44e2b4712SSatish Balay     Factorization code for BAIJ format.
54e2b4712SSatish Balay */
64e2b4712SSatish Balay 
77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
8c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
916a2bf60SHong Zhang #include "petscbt.h"
1016a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
114e2b4712SSatish Balay 
124a2ae208SSatish Balay #undef __FUNCT__
1393fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
1493fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1593fd935bSShri Abhyankar {
1693fd935bSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1793fd935bSShri Abhyankar   PetscErrorCode    ierr;
1893fd935bSShri Abhyankar   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
1993fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
2093fd935bSShri Abhyankar   PetscInt          nz;
2193fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
2293fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
2393fd935bSShri Abhyankar   const PetscScalar *b;
2493fd935bSShri Abhyankar 
2593fd935bSShri Abhyankar   PetscFunctionBegin;
263649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2793fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2893fd935bSShri Abhyankar   tmp  = a->solve_work;
2993fd935bSShri Abhyankar 
3093fd935bSShri Abhyankar 
3193fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
3293fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[i];
3393fd935bSShri Abhyankar 
3493fd935bSShri Abhyankar   /* forward solve the U^T */
3593fd935bSShri Abhyankar   for (i=0; i<n; i++) {
3693fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
3793fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
3893fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
3993fd935bSShri Abhyankar     s1  = tmp[i];
4093fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
4193fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
4293fd935bSShri Abhyankar     tmp[i] = s1;
4393fd935bSShri Abhyankar   }
4493fd935bSShri Abhyankar 
4593fd935bSShri Abhyankar   /* backward solve the L^T */
4693fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
4793fd935bSShri Abhyankar     v   = aa + ai[i];
4893fd935bSShri Abhyankar     vi  = aj + ai[i];
4993fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
5093fd935bSShri Abhyankar     s1  = tmp[i];
5193fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
5293fd935bSShri Abhyankar   }
5393fd935bSShri Abhyankar 
5493fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
5593fd935bSShri Abhyankar   for (i=0; i<n; i++) x[i] = tmp[i];
5693fd935bSShri Abhyankar 
573649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5893fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5993fd935bSShri Abhyankar 
6093fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
6193fd935bSShri Abhyankar   PetscFunctionReturn(0);
6293fd935bSShri Abhyankar }
6393fd935bSShri Abhyankar 
6493fd935bSShri Abhyankar #undef __FUNCT__
6506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
6606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
67f1af5d2fSBarry Smith {
68f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
69dfbe8321SBarry Smith   PetscErrorCode    ierr;
700b68f018SBarry Smith   PetscInt          i,nz;
710b68f018SBarry Smith   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
720b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
730b68f018SBarry Smith   PetscScalar       s1,*x;
740b68f018SBarry Smith   const PetscScalar *b;
75f1af5d2fSBarry Smith 
76f1af5d2fSBarry Smith   PetscFunctionBegin;
77ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
783649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith   /* forward solve the U^T */
82f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
83f1af5d2fSBarry Smith 
84f1af5d2fSBarry Smith     v     = aa + diag[i];
85f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
86ef66eb69SBarry Smith     s1    = (*v++)*x[i];
87f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
88f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
89f1af5d2fSBarry Smith     while (nz--) {
90f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
91f1af5d2fSBarry Smith     }
92f1af5d2fSBarry Smith     x[i]   = s1;
93f1af5d2fSBarry Smith   }
94f1af5d2fSBarry Smith   /* backward solve the L^T */
95f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
96f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
97f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
98f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
99f1af5d2fSBarry Smith     s1   = x[i];
100f1af5d2fSBarry Smith     while (nz--) {
101f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
102f1af5d2fSBarry Smith     }
103f1af5d2fSBarry Smith   }
1043649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
106dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
107f1af5d2fSBarry Smith   PetscFunctionReturn(0);
108f1af5d2fSBarry Smith }
109f1af5d2fSBarry Smith 
1104a2ae208SSatish Balay #undef __FUNCT__
11106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
11206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
113f1af5d2fSBarry Smith {
114f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
115dfbe8321SBarry Smith   PetscErrorCode    ierr;
116b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
117b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
118b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
119b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
120b3260449SShri Abhyankar   const PetscScalar *b;
121f1af5d2fSBarry Smith 
122f1af5d2fSBarry Smith   PetscFunctionBegin;
123ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1243649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126f1af5d2fSBarry Smith 
127f1af5d2fSBarry Smith   /* forward solve the U^T */
128f1af5d2fSBarry Smith   idx = 0;
129f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
130f1af5d2fSBarry Smith 
131f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
132f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
133ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
134f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
135f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
136f1af5d2fSBarry Smith     v += 4;
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
139f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
140f1af5d2fSBarry Smith     while (nz--) {
141f1af5d2fSBarry Smith       oidx = 2*(*vi++);
142f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
143f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
144f1af5d2fSBarry Smith       v  += 4;
145f1af5d2fSBarry Smith     }
146f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
147f1af5d2fSBarry Smith     idx += 2;
148f1af5d2fSBarry Smith   }
149f1af5d2fSBarry Smith   /* backward solve the L^T */
150f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
151f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
152f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
153f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
154f1af5d2fSBarry Smith     idt  = 2*i;
155f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
156f1af5d2fSBarry Smith     while (nz--) {
157f1af5d2fSBarry Smith       idx   = 2*(*vi--);
158f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
159f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
160f1af5d2fSBarry Smith       v -= 4;
161f1af5d2fSBarry Smith     }
162f1af5d2fSBarry Smith   }
1633649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1641ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
165dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
166f1af5d2fSBarry Smith   PetscFunctionReturn(0);
167f1af5d2fSBarry Smith }
168f1af5d2fSBarry Smith 
1694a2ae208SSatish Balay #undef __FUNCT__
1704dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1714dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1726929473cSShri Abhyankar {
1736929473cSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1746929473cSShri Abhyankar   PetscErrorCode    ierr;
175b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1766929473cSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
177b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
178b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
179b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
180b3260449SShri Abhyankar   const PetscScalar *b;
1816929473cSShri Abhyankar 
1826929473cSShri Abhyankar   PetscFunctionBegin;
1836929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1843649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1856929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1866929473cSShri Abhyankar 
1876929473cSShri Abhyankar   /* forward solve the U^T */
1886929473cSShri Abhyankar   idx = 0;
1896929473cSShri Abhyankar   for (i=0; i<n; i++) {
1906929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1916929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1926929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1936929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1946929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1956929473cSShri Abhyankar     v -= bs2;
1966929473cSShri Abhyankar 
1976929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1986929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1996929473cSShri Abhyankar     for(j=0;j>-nz;j--){
2006929473cSShri Abhyankar       oidx = bs*vi[j];
2016929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
2026929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
2036929473cSShri Abhyankar       v  -= bs2;
2046929473cSShri Abhyankar     }
2056929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
2066929473cSShri Abhyankar     idx += bs;
2076929473cSShri Abhyankar   }
2086929473cSShri Abhyankar   /* backward solve the L^T */
2096929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
2106929473cSShri Abhyankar     v    = aa + bs2*ai[i];
2116929473cSShri Abhyankar     vi   = aj + ai[i];
2126929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
2136929473cSShri Abhyankar     idt  = bs*i;
2146929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
2156929473cSShri Abhyankar     for(j=0;j<nz;j++){
2166929473cSShri Abhyankar       idx   = bs*vi[j];
2176929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
2186929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
2196929473cSShri Abhyankar       v += bs2;
2206929473cSShri Abhyankar     }
2216929473cSShri Abhyankar   }
2223649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2236929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2246929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2256929473cSShri Abhyankar   PetscFunctionReturn(0);
2266929473cSShri Abhyankar }
2276929473cSShri Abhyankar 
2286929473cSShri Abhyankar #undef __FUNCT__
22906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
23006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
231f1af5d2fSBarry Smith {
232f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
233dfbe8321SBarry Smith   PetscErrorCode    ierr;
234b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
235b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
236b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
237b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
238b3260449SShri Abhyankar   const PetscScalar *b;
239f1af5d2fSBarry Smith 
240f1af5d2fSBarry Smith   PetscFunctionBegin;
241ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2423649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
244f1af5d2fSBarry Smith 
245f1af5d2fSBarry Smith   /* forward solve the U^T */
246f1af5d2fSBarry Smith   idx = 0;
247f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
248f1af5d2fSBarry Smith 
249f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
250f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
251ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
252f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
253f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
254f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
255f1af5d2fSBarry Smith     v += 9;
256f1af5d2fSBarry Smith 
257f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
258f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
259f1af5d2fSBarry Smith     while (nz--) {
260f1af5d2fSBarry Smith       oidx = 3*(*vi++);
261f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
262f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
263f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
264f1af5d2fSBarry Smith       v  += 9;
265f1af5d2fSBarry Smith     }
266f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
267f1af5d2fSBarry Smith     idx += 3;
268f1af5d2fSBarry Smith   }
269f1af5d2fSBarry Smith   /* backward solve the L^T */
270f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
271f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
272f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
273f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
274f1af5d2fSBarry Smith     idt  = 3*i;
275f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
276f1af5d2fSBarry Smith     while (nz--) {
277f1af5d2fSBarry Smith       idx   = 3*(*vi--);
278f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
279f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
280f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
281f1af5d2fSBarry Smith       v -= 9;
282f1af5d2fSBarry Smith     }
283f1af5d2fSBarry Smith   }
2843649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2851ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
286dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
287f1af5d2fSBarry Smith   PetscFunctionReturn(0);
288f1af5d2fSBarry Smith }
289f1af5d2fSBarry Smith 
2904a2ae208SSatish Balay #undef __FUNCT__
2914dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2924dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2938499736aSShri Abhyankar {
2948499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2958499736aSShri Abhyankar   PetscErrorCode    ierr;
296b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2978499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
298b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
299b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
300b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
301b3260449SShri Abhyankar   const PetscScalar *b;
3028499736aSShri Abhyankar 
3038499736aSShri Abhyankar   PetscFunctionBegin;
3048499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3053649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3068499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3078499736aSShri Abhyankar 
3088499736aSShri Abhyankar   /* forward solve the U^T */
3098499736aSShri Abhyankar   idx = 0;
3108499736aSShri Abhyankar   for (i=0; i<n; i++) {
3118499736aSShri Abhyankar     v     = aa + bs2*diag[i];
3128499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
3138499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
3148499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
3158499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
3168499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
3178499736aSShri Abhyankar     v -= bs2;
3188499736aSShri Abhyankar 
3198499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3208499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3218499736aSShri Abhyankar     for(j=0;j>-nz;j--){
3228499736aSShri Abhyankar       oidx = bs*vi[j];
3238499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3248499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3258499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3268499736aSShri Abhyankar       v  -= bs2;
3278499736aSShri Abhyankar     }
3288499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
3298499736aSShri Abhyankar     idx += bs;
3308499736aSShri Abhyankar   }
3318499736aSShri Abhyankar   /* backward solve the L^T */
3328499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
3338499736aSShri Abhyankar     v    = aa + bs2*ai[i];
3348499736aSShri Abhyankar     vi   = aj + ai[i];
3358499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
3368499736aSShri Abhyankar     idt  = bs*i;
3378499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
3388499736aSShri Abhyankar     for(j=0;j<nz;j++){
3398499736aSShri Abhyankar       idx   = bs*vi[j];
3408499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3418499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3428499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3438499736aSShri Abhyankar       v += bs2;
3448499736aSShri Abhyankar     }
3458499736aSShri Abhyankar   }
3463649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3478499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3488499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3498499736aSShri Abhyankar   PetscFunctionReturn(0);
3508499736aSShri Abhyankar }
3518499736aSShri Abhyankar 
3528499736aSShri Abhyankar #undef __FUNCT__
35306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
35406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
355f1af5d2fSBarry Smith {
356f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
357dfbe8321SBarry Smith   PetscErrorCode    ierr;
358b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
359b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
360b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
361b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
362b3260449SShri Abhyankar   const PetscScalar *b;
363f1af5d2fSBarry Smith 
364f1af5d2fSBarry Smith   PetscFunctionBegin;
365ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3663649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
368f1af5d2fSBarry Smith 
369f1af5d2fSBarry Smith   /* forward solve the U^T */
370f1af5d2fSBarry Smith   idx = 0;
371f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
372f1af5d2fSBarry Smith 
373f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
374f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
375ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
376f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
377f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
378f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
379f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
380f1af5d2fSBarry Smith     v += 16;
381f1af5d2fSBarry Smith 
382f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
383f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
384f1af5d2fSBarry Smith     while (nz--) {
385f1af5d2fSBarry Smith       oidx = 4*(*vi++);
386f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390f1af5d2fSBarry Smith       v  += 16;
391f1af5d2fSBarry Smith     }
392f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
393f1af5d2fSBarry Smith     idx += 4;
394f1af5d2fSBarry Smith   }
395f1af5d2fSBarry Smith   /* backward solve the L^T */
396f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
397f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
398f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
399f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
400f1af5d2fSBarry Smith     idt  = 4*i;
401f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
402f1af5d2fSBarry Smith     while (nz--) {
403f1af5d2fSBarry Smith       idx   = 4*(*vi--);
404f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
405f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
406f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
407f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
408f1af5d2fSBarry Smith       v -= 16;
409f1af5d2fSBarry Smith     }
410f1af5d2fSBarry Smith   }
4113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
413dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
414f1af5d2fSBarry Smith   PetscFunctionReturn(0);
415f1af5d2fSBarry Smith }
416f1af5d2fSBarry Smith 
4174a2ae208SSatish Balay #undef __FUNCT__
4184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
4194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4208499736aSShri Abhyankar {
4218499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4228499736aSShri Abhyankar   PetscErrorCode    ierr;
423b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
4248499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
425b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
426b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
427b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
428b3260449SShri Abhyankar   const PetscScalar *b;
4298499736aSShri Abhyankar 
4308499736aSShri Abhyankar   PetscFunctionBegin;
4318499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4323649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4338499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4348499736aSShri Abhyankar 
4358499736aSShri Abhyankar   /* forward solve the U^T */
4368499736aSShri Abhyankar   idx = 0;
4378499736aSShri Abhyankar   for (i=0; i<n; i++) {
4388499736aSShri Abhyankar     v     = aa + bs2*diag[i];
4398499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
4408499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
4418499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
4428499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
4438499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
4448499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
4458499736aSShri Abhyankar     v -= bs2;
4468499736aSShri Abhyankar 
4478499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
4488499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
4498499736aSShri Abhyankar     for(j=0;j>-nz;j--){
4508499736aSShri Abhyankar       oidx = bs*vi[j];
4518499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4528499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4538499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4548499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4558499736aSShri Abhyankar       v  -= bs2;
4568499736aSShri Abhyankar     }
4578499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4588499736aSShri Abhyankar     idx += bs;
4598499736aSShri Abhyankar   }
4608499736aSShri Abhyankar   /* backward solve the L^T */
4618499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
4628499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4638499736aSShri Abhyankar     vi   = aj + ai[i];
4648499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4658499736aSShri Abhyankar     idt  = bs*i;
4668499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4678499736aSShri Abhyankar     for(j=0;j<nz;j++){
4688499736aSShri Abhyankar       idx   = bs*vi[j];
4698499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4708499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4718499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4728499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4738499736aSShri Abhyankar       v += bs2;
4748499736aSShri Abhyankar     }
4758499736aSShri Abhyankar   }
4763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4778499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4788499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4798499736aSShri Abhyankar   PetscFunctionReturn(0);
4808499736aSShri Abhyankar }
4818499736aSShri Abhyankar 
4828499736aSShri Abhyankar #undef __FUNCT__
48306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
48406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
485f1af5d2fSBarry Smith {
486f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
487dfbe8321SBarry Smith   PetscErrorCode    ierr;
488b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
489b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
490b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
491b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
492b3260449SShri Abhyankar   const PetscScalar *b;
493f1af5d2fSBarry Smith 
494f1af5d2fSBarry Smith   PetscFunctionBegin;
495ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4963649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4971ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
498f1af5d2fSBarry Smith 
499f1af5d2fSBarry Smith   /* forward solve the U^T */
500f1af5d2fSBarry Smith   idx = 0;
501f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
502f1af5d2fSBarry Smith 
503f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
504f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
505ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
506f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
507f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
508f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
509f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
510f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
511f1af5d2fSBarry Smith     v += 25;
512f1af5d2fSBarry Smith 
513f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
514f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
515f1af5d2fSBarry Smith     while (nz--) {
516f1af5d2fSBarry Smith       oidx = 5*(*vi++);
517f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
518f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
519f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
520f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
521f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
522f1af5d2fSBarry Smith       v  += 25;
523f1af5d2fSBarry Smith     }
524f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
525f1af5d2fSBarry Smith     idx += 5;
526f1af5d2fSBarry Smith   }
527f1af5d2fSBarry Smith   /* backward solve the L^T */
528f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
529f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
530f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
531f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
532f1af5d2fSBarry Smith     idt  = 5*i;
533f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
534f1af5d2fSBarry Smith     while (nz--) {
535f1af5d2fSBarry Smith       idx   = 5*(*vi--);
536f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
537f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
538f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
539f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
540f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
541f1af5d2fSBarry Smith       v -= 25;
542f1af5d2fSBarry Smith     }
543f1af5d2fSBarry Smith   }
5443649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5451ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
546dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
547f1af5d2fSBarry Smith   PetscFunctionReturn(0);
548f1af5d2fSBarry Smith }
549f1af5d2fSBarry Smith 
5504a2ae208SSatish Balay #undef __FUNCT__
5514dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5524dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5538499736aSShri Abhyankar {
5548499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5558499736aSShri Abhyankar   PetscErrorCode ierr;
556b3260449SShri Abhyankar   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5578499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
558b3260449SShri Abhyankar   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
559b3260449SShri Abhyankar   const MatScalar      *aa=a->a,*v;
560b3260449SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
561b3260449SShri Abhyankar   const PetscScalar    *b;
5628499736aSShri Abhyankar 
5638499736aSShri Abhyankar   PetscFunctionBegin;
5648499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5653649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5668499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5678499736aSShri Abhyankar 
5688499736aSShri Abhyankar   /* forward solve the U^T */
5698499736aSShri Abhyankar   idx = 0;
5708499736aSShri Abhyankar   for (i=0; i<n; i++) {
5718499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5728499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5738499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5748499736aSShri Abhyankar     x5 = x[4+idx];
5758499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5768499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5778499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5788499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5798499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5808499736aSShri Abhyankar     v -= bs2;
5818499736aSShri Abhyankar 
5828499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5838499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5848499736aSShri Abhyankar     for(j=0;j>-nz;j--){
5858499736aSShri Abhyankar       oidx = bs*vi[j];
5868499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5878499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5888499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5898499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5908499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5918499736aSShri Abhyankar       v  -= bs2;
5928499736aSShri Abhyankar     }
5938499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5948499736aSShri Abhyankar     idx += bs;
5958499736aSShri Abhyankar   }
5968499736aSShri Abhyankar   /* backward solve the L^T */
5978499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
5988499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5998499736aSShri Abhyankar     vi   = aj + ai[i];
6008499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
6018499736aSShri Abhyankar     idt  = bs*i;
6028499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
6038499736aSShri Abhyankar     for(j=0;j<nz;j++){
6048499736aSShri Abhyankar       idx   = bs*vi[j];
6058499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
6068499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
6078499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
6088499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
6098499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
6108499736aSShri Abhyankar       v += bs2;
6118499736aSShri Abhyankar     }
6128499736aSShri Abhyankar   }
6133649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
6148499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
6158499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
6168499736aSShri Abhyankar   PetscFunctionReturn(0);
6178499736aSShri Abhyankar }
6188499736aSShri Abhyankar 
6198499736aSShri Abhyankar #undef __FUNCT__
62006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
62106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
622f1af5d2fSBarry Smith {
623f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
624dfbe8321SBarry Smith   PetscErrorCode    ierr;
625b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
626b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
627b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
628b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
629b3260449SShri Abhyankar   const PetscScalar *b;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   PetscFunctionBegin;
632ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
6341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
635f1af5d2fSBarry Smith 
636f1af5d2fSBarry Smith   /* forward solve the U^T */
637f1af5d2fSBarry Smith   idx = 0;
638f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
639f1af5d2fSBarry Smith 
640f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
641f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
642ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
643ef66eb69SBarry Smith     x6    = x[5+idx];
644f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
645f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
646f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
647f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
648f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
649f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
650f1af5d2fSBarry Smith     v += 36;
651f1af5d2fSBarry Smith 
652f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
653f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
654f1af5d2fSBarry Smith     while (nz--) {
655f1af5d2fSBarry Smith       oidx = 6*(*vi++);
656f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
657f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
658f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
659f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
660f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
661f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
662f1af5d2fSBarry Smith       v  += 36;
663f1af5d2fSBarry Smith     }
664f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
665f1af5d2fSBarry Smith     x[5+idx] = s6;
666f1af5d2fSBarry Smith     idx += 6;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 6*i;
674f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
675f1af5d2fSBarry Smith     s6 = x[5+idt];
676f1af5d2fSBarry Smith     while (nz--) {
677f1af5d2fSBarry Smith       idx   = 6*(*vi--);
678f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684f1af5d2fSBarry Smith       v -= 36;
685f1af5d2fSBarry Smith     }
686f1af5d2fSBarry Smith   }
6873649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
6881ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
689dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
690f1af5d2fSBarry Smith   PetscFunctionReturn(0);
691f1af5d2fSBarry Smith }
692f1af5d2fSBarry Smith 
6934a2ae208SSatish Balay #undef __FUNCT__
6944dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6954dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6968499736aSShri Abhyankar {
6978499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
6988499736aSShri Abhyankar   PetscErrorCode    ierr;
699b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
7008499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
701b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
702b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
703b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
704b3260449SShri Abhyankar   const PetscScalar *b;
7058499736aSShri Abhyankar 
7068499736aSShri Abhyankar   PetscFunctionBegin;
7078499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7083649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
7098499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
7108499736aSShri Abhyankar 
7118499736aSShri Abhyankar   /* forward solve the U^T */
7128499736aSShri Abhyankar   idx = 0;
7138499736aSShri Abhyankar   for (i=0; i<n; i++) {
7148499736aSShri Abhyankar     v     = aa + bs2*diag[i];
7158499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
7168499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
7178499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
7188499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
7198499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
7208499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
7218499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
7228499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
7238499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
7248499736aSShri Abhyankar     v -= bs2;
7258499736aSShri Abhyankar 
7268499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
7278499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
7288499736aSShri Abhyankar     for(j=0;j>-nz;j--){
7298499736aSShri Abhyankar       oidx = bs*vi[j];
7308499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7318499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7328499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7338499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7348499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7358499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7368499736aSShri Abhyankar       v  -= bs2;
7378499736aSShri Abhyankar     }
7388499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
7398499736aSShri Abhyankar     x[5+idx] = s6;
7408499736aSShri Abhyankar     idx += bs;
7418499736aSShri Abhyankar   }
7428499736aSShri Abhyankar   /* backward solve the L^T */
7438499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
7448499736aSShri Abhyankar     v    = aa + bs2*ai[i];
7458499736aSShri Abhyankar     vi   = aj + ai[i];
7468499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
7478499736aSShri Abhyankar     idt  = bs*i;
7488499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
7498499736aSShri Abhyankar     s6   = x[5+idt];
7508499736aSShri Abhyankar     for(j=0;j<nz;j++){
7518499736aSShri Abhyankar       idx   = bs*vi[j];
7528499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7538499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7548499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7558499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7568499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7578499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7588499736aSShri Abhyankar       v += bs2;
7598499736aSShri Abhyankar     }
7608499736aSShri Abhyankar   }
7613649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
7628499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7638499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7648499736aSShri Abhyankar   PetscFunctionReturn(0);
7658499736aSShri Abhyankar }
7668499736aSShri Abhyankar 
7678499736aSShri Abhyankar #undef __FUNCT__
76806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
76906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
770f1af5d2fSBarry Smith {
771f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
772dfbe8321SBarry Smith   PetscErrorCode    ierr;
773b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
774b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
775b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
776b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
777b3260449SShri Abhyankar   const PetscScalar *b;
778f1af5d2fSBarry Smith 
779f1af5d2fSBarry Smith   PetscFunctionBegin;
780ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7813649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
7821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
783f1af5d2fSBarry Smith 
784f1af5d2fSBarry Smith   /* forward solve the U^T */
785f1af5d2fSBarry Smith   idx = 0;
786f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
787f1af5d2fSBarry Smith 
788f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
789f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
790ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
791ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
792f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
793f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
794f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
795f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
796f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
797f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
798f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
799f1af5d2fSBarry Smith     v += 49;
800f1af5d2fSBarry Smith 
801f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
802f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
803f1af5d2fSBarry Smith     while (nz--) {
804f1af5d2fSBarry Smith       oidx = 7*(*vi++);
805f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
806f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
807f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
808f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
809f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
810f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
811f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
812f1af5d2fSBarry Smith       v  += 49;
813f1af5d2fSBarry Smith     }
814f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
815f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
816f1af5d2fSBarry Smith     idx += 7;
817f1af5d2fSBarry Smith   }
818f1af5d2fSBarry Smith   /* backward solve the L^T */
819f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
820f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
821f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
822f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
823f1af5d2fSBarry Smith     idt  = 7*i;
824f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
825f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
826f1af5d2fSBarry Smith     while (nz--) {
827f1af5d2fSBarry Smith       idx   = 7*(*vi--);
828f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835f1af5d2fSBarry Smith       v -= 49;
836f1af5d2fSBarry Smith     }
837f1af5d2fSBarry Smith   }
8383649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
8391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
840dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
841f1af5d2fSBarry Smith   PetscFunctionReturn(0);
842f1af5d2fSBarry Smith }
8438499736aSShri Abhyankar #undef __FUNCT__
8444dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
8454dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
8468499736aSShri Abhyankar {
8478499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
8488499736aSShri Abhyankar   PetscErrorCode    ierr;
849b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
8508499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
851b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
852b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
853b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
854b3260449SShri Abhyankar   const PetscScalar *b;
8558499736aSShri Abhyankar 
8568499736aSShri Abhyankar   PetscFunctionBegin;
8578499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8583649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
8598499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8608499736aSShri Abhyankar 
8618499736aSShri Abhyankar   /* forward solve the U^T */
8628499736aSShri Abhyankar   idx = 0;
8638499736aSShri Abhyankar   for (i=0; i<n; i++) {
8648499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8658499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8668499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8678499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8688499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8698499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8708499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8718499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8728499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8738499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8748499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8758499736aSShri Abhyankar     v -= bs2;
8768499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8778499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8788499736aSShri Abhyankar     for(j=0;j>-nz;j--){
8798499736aSShri Abhyankar       oidx = bs*vi[j];
8808499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8818499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8828499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8838499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8848499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8858499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8868499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8878499736aSShri Abhyankar       v  -= bs2;
8888499736aSShri Abhyankar     }
8898499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8908499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8918499736aSShri Abhyankar     idx += bs;
8928499736aSShri Abhyankar   }
8938499736aSShri Abhyankar   /* backward solve the L^T */
8948499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
8958499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8968499736aSShri Abhyankar     vi   = aj + ai[i];
8978499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8988499736aSShri Abhyankar     idt  = bs*i;
8998499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
9008499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
9018499736aSShri Abhyankar     for(j=0;j<nz;j++){
9028499736aSShri Abhyankar       idx   = bs*vi[j];
9038499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
9048499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
9058499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
9068499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
9078499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
9088499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
9098499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
9108499736aSShri Abhyankar       v += bs2;
9118499736aSShri Abhyankar     }
9128499736aSShri Abhyankar   }
9133649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
9148499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
9158499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
9168499736aSShri Abhyankar   PetscFunctionReturn(0);
9178499736aSShri Abhyankar }
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
9204a2ae208SSatish Balay #undef __FUNCT__
92193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
92293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
92393fd935bSShri Abhyankar {
92493fd935bSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
92593fd935bSShri Abhyankar   IS                iscol = a->col,isrow = a->row;
92693fd935bSShri Abhyankar   PetscErrorCode    ierr;
92793fd935bSShri Abhyankar   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
92893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
92993fd935bSShri Abhyankar   PetscInt          nz;
93093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
93193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
93293fd935bSShri Abhyankar   const PetscScalar *b;
93393fd935bSShri Abhyankar 
93493fd935bSShri Abhyankar   PetscFunctionBegin;
9353649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
93693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
93793fd935bSShri Abhyankar   tmp  = a->solve_work;
93893fd935bSShri Abhyankar 
93993fd935bSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
94093fd935bSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
94193fd935bSShri Abhyankar 
94293fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
94393fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[c[i]];
94493fd935bSShri Abhyankar 
94593fd935bSShri Abhyankar   /* forward solve the U^T */
94693fd935bSShri Abhyankar   for (i=0; i<n; i++) {
94793fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
94893fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
94993fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
95093fd935bSShri Abhyankar     s1  = tmp[i];
95193fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
95293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
95393fd935bSShri Abhyankar     tmp[i] = s1;
95493fd935bSShri Abhyankar   }
95593fd935bSShri Abhyankar 
95693fd935bSShri Abhyankar   /* backward solve the L^T */
95793fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
95893fd935bSShri Abhyankar     v   = aa + ai[i];
95993fd935bSShri Abhyankar     vi  = aj + ai[i];
96093fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
96193fd935bSShri Abhyankar     s1  = tmp[i];
96293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
96393fd935bSShri Abhyankar   }
96493fd935bSShri Abhyankar 
96593fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
96693fd935bSShri Abhyankar   for (i=0; i<n; i++) x[r[i]] = tmp[i];
96793fd935bSShri Abhyankar 
96893fd935bSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
96993fd935bSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9703649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
97193fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
97293fd935bSShri Abhyankar 
97393fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
97493fd935bSShri Abhyankar   PetscFunctionReturn(0);
97593fd935bSShri Abhyankar }
97693fd935bSShri Abhyankar 
97793fd935bSShri Abhyankar #undef __FUNCT__
97806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
97906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
980f1af5d2fSBarry Smith {
981f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
982f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9836849ba73SBarry Smith   PetscErrorCode    ierr;
9845d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
985b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
986b3260449SShri Abhyankar   PetscInt          i,nz;
987b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
988b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
989b3260449SShri Abhyankar   const PetscScalar *b;
990f1af5d2fSBarry Smith 
991f1af5d2fSBarry Smith   PetscFunctionBegin;
9923649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
9931ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
994f1af5d2fSBarry Smith   t  = a->solve_work;
995f1af5d2fSBarry Smith 
996f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
997f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
998f1af5d2fSBarry Smith 
999f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1000f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1001f1af5d2fSBarry Smith     t[i] = b[c[i]];
1002f1af5d2fSBarry Smith   }
1003f1af5d2fSBarry Smith 
1004f1af5d2fSBarry Smith   /* forward solve the U^T */
1005f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1006f1af5d2fSBarry Smith 
1007f1af5d2fSBarry Smith     v     = aa + diag[i];
1008f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1009f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
1010f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1011f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1012f1af5d2fSBarry Smith     while (nz--) {
1013f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
1014f1af5d2fSBarry Smith     }
1015f1af5d2fSBarry Smith     t[i]   = s1;
1016f1af5d2fSBarry Smith   }
1017f1af5d2fSBarry Smith   /* backward solve the L^T */
1018f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1019f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
1020f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1021f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1022f1af5d2fSBarry Smith     s1   = t[i];
1023f1af5d2fSBarry Smith     while (nz--) {
1024f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
1025f1af5d2fSBarry Smith     }
1026f1af5d2fSBarry Smith   }
1027f1af5d2fSBarry Smith 
1028f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     x[r[i]]   = t[i];
1031f1af5d2fSBarry Smith   }
1032f1af5d2fSBarry Smith 
1033f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1034f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10353649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
10361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1037dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1038f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1039f1af5d2fSBarry Smith }
1040f1af5d2fSBarry Smith 
10414a2ae208SSatish Balay #undef __FUNCT__
104206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
104306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1044f1af5d2fSBarry Smith {
1045f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1046f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
10476849ba73SBarry Smith   PetscErrorCode    ierr;
10485d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1049b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1050b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1051b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1052b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1053b3260449SShri Abhyankar   const PetscScalar *b;
1054f1af5d2fSBarry Smith 
1055f1af5d2fSBarry Smith   PetscFunctionBegin;
10563649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
10571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1058f1af5d2fSBarry Smith   t  = a->solve_work;
1059f1af5d2fSBarry Smith 
1060f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1061f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1062f1af5d2fSBarry Smith 
1063f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1064f1af5d2fSBarry Smith   ii = 0;
1065f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1066f1af5d2fSBarry Smith     ic      = 2*c[i];
1067f1af5d2fSBarry Smith     t[ii]   = b[ic];
1068f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1069f1af5d2fSBarry Smith     ii += 2;
1070f1af5d2fSBarry Smith   }
1071f1af5d2fSBarry Smith 
1072f1af5d2fSBarry Smith   /* forward solve the U^T */
1073f1af5d2fSBarry Smith   idx = 0;
1074f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1075f1af5d2fSBarry Smith 
1076f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
1077f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1078f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
1079f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
1080f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
1081f1af5d2fSBarry Smith     v += 4;
1082f1af5d2fSBarry Smith 
1083f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1084f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1085f1af5d2fSBarry Smith     while (nz--) {
1086f1af5d2fSBarry Smith       oidx = 2*(*vi++);
1087f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1088f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1089f1af5d2fSBarry Smith       v  += 4;
1090f1af5d2fSBarry Smith     }
1091f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1092f1af5d2fSBarry Smith     idx += 2;
1093f1af5d2fSBarry Smith   }
1094f1af5d2fSBarry Smith   /* backward solve the L^T */
1095f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1096f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
1097f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1098f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1099f1af5d2fSBarry Smith     idt  = 2*i;
1100f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1101f1af5d2fSBarry Smith     while (nz--) {
1102f1af5d2fSBarry Smith       idx   = 2*(*vi--);
1103f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1104f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1105f1af5d2fSBarry Smith       v -= 4;
1106f1af5d2fSBarry Smith     }
1107f1af5d2fSBarry Smith   }
1108f1af5d2fSBarry Smith 
1109f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1110f1af5d2fSBarry Smith   ii = 0;
1111f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1112f1af5d2fSBarry Smith     ir      = 2*r[i];
1113f1af5d2fSBarry Smith     x[ir]   = t[ii];
1114f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1115f1af5d2fSBarry Smith     ii += 2;
1116f1af5d2fSBarry Smith   }
1117f1af5d2fSBarry Smith 
1118f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1119f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11203649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
11211ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1122dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1123f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1124f1af5d2fSBarry Smith }
1125f1af5d2fSBarry Smith 
11264a2ae208SSatish Balay #undef __FUNCT__
11274dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
11284dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
112932121132SShri Abhyankar {
113032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
113132121132SShri Abhyankar   PetscErrorCode    ierr;
113232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1133b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
113432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
113532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1136b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1137b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1138b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1139b3260449SShri Abhyankar   const PetscScalar *b;
114032121132SShri Abhyankar 
114132121132SShri Abhyankar   PetscFunctionBegin;
11423649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
114332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
114432121132SShri Abhyankar   t = a->solve_work;
114532121132SShri Abhyankar 
114632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
114732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
114832121132SShri Abhyankar 
114932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
115032121132SShri Abhyankar   for(i=0;i<n;i++){
115132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
115232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
115332121132SShri Abhyankar   }
115432121132SShri Abhyankar 
115532121132SShri Abhyankar   /* forward solve the U^T */
115632121132SShri Abhyankar   idx = 0;
115732121132SShri Abhyankar   for (i=0; i<n; i++) {
115832121132SShri Abhyankar     v     = aa + bs2*diag[i];
115932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
116032121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
116132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
116232121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
116332121132SShri Abhyankar     v -= bs2;
116432121132SShri Abhyankar 
116532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
116632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
116732121132SShri Abhyankar     for(j=0;j>-nz;j--){
116832121132SShri Abhyankar       oidx = bs*vi[j];
116932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
117032121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
117132121132SShri Abhyankar       v  -= bs2;
117232121132SShri Abhyankar     }
117332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
117432121132SShri Abhyankar     idx += bs;
117532121132SShri Abhyankar   }
117632121132SShri Abhyankar   /* backward solve the L^T */
117732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
117832121132SShri Abhyankar     v    = aa + bs2*ai[i];
117932121132SShri Abhyankar     vi   = aj + ai[i];
118032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
118132121132SShri Abhyankar     idt  = bs*i;
118232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
118332121132SShri Abhyankar     for(j=0;j<nz;j++){
118432121132SShri Abhyankar       idx   = bs*vi[j];
118532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
118632121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
118732121132SShri Abhyankar       v += bs2;
118832121132SShri Abhyankar     }
118932121132SShri Abhyankar   }
119032121132SShri Abhyankar 
119132121132SShri Abhyankar   /* copy t into x according to permutation */
119232121132SShri Abhyankar   for(i=0;i<n;i++){
119332121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
119432121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
119532121132SShri Abhyankar   }
119632121132SShri Abhyankar 
119732121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
119832121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11993649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
120032121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
120132121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
120232121132SShri Abhyankar   PetscFunctionReturn(0);
120332121132SShri Abhyankar }
120432121132SShri Abhyankar 
120532121132SShri Abhyankar #undef __FUNCT__
120606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
120706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1208f1af5d2fSBarry Smith {
1209f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1210f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
12116849ba73SBarry Smith   PetscErrorCode    ierr;
12125d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1213b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1214b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1215b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1216b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1217b3260449SShri Abhyankar   const PetscScalar *b;
1218f1af5d2fSBarry Smith 
1219f1af5d2fSBarry Smith   PetscFunctionBegin;
12203649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
12211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1222f1af5d2fSBarry Smith   t  = a->solve_work;
1223f1af5d2fSBarry Smith 
1224f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1225f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1226f1af5d2fSBarry Smith 
1227f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1228f1af5d2fSBarry Smith   ii = 0;
1229f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1230f1af5d2fSBarry Smith     ic      = 3*c[i];
1231f1af5d2fSBarry Smith     t[ii]   = b[ic];
1232f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1233f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1234f1af5d2fSBarry Smith     ii += 3;
1235f1af5d2fSBarry Smith   }
1236f1af5d2fSBarry Smith 
1237f1af5d2fSBarry Smith   /* forward solve the U^T */
1238f1af5d2fSBarry Smith   idx = 0;
1239f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1240f1af5d2fSBarry Smith 
1241f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1242f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1243f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1244f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1245f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1246f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1247f1af5d2fSBarry Smith     v += 9;
1248f1af5d2fSBarry Smith 
1249f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1250f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1251f1af5d2fSBarry Smith     while (nz--) {
1252f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1253f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1254f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1255f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1256f1af5d2fSBarry Smith       v  += 9;
1257f1af5d2fSBarry Smith     }
1258f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1259f1af5d2fSBarry Smith     idx += 3;
1260f1af5d2fSBarry Smith   }
1261f1af5d2fSBarry Smith   /* backward solve the L^T */
1262f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1263f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1264f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1265f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1266f1af5d2fSBarry Smith     idt  = 3*i;
1267f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1268f1af5d2fSBarry Smith     while (nz--) {
1269f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1270f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1271f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1272f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1273f1af5d2fSBarry Smith       v -= 9;
1274f1af5d2fSBarry Smith     }
1275f1af5d2fSBarry Smith   }
1276f1af5d2fSBarry Smith 
1277f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1278f1af5d2fSBarry Smith   ii = 0;
1279f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1280f1af5d2fSBarry Smith     ir      = 3*r[i];
1281f1af5d2fSBarry Smith     x[ir]   = t[ii];
1282f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1283f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1284f1af5d2fSBarry Smith     ii += 3;
1285f1af5d2fSBarry Smith   }
1286f1af5d2fSBarry Smith 
1287f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1288f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12893649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
12901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1291dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1292f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1293f1af5d2fSBarry Smith }
1294f1af5d2fSBarry Smith 
12954a2ae208SSatish Balay #undef __FUNCT__
12964dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
12974dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
129832121132SShri Abhyankar {
129932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
130032121132SShri Abhyankar   PetscErrorCode    ierr;
130132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1302b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
130332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
130432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1305b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1306b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1307b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1308b3260449SShri Abhyankar   const PetscScalar *b;
130932121132SShri Abhyankar 
131032121132SShri Abhyankar   PetscFunctionBegin;
13113649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
131232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131332121132SShri Abhyankar   t = a->solve_work;
131432121132SShri Abhyankar 
131532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
131632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
131732121132SShri Abhyankar 
131832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
131932121132SShri Abhyankar   for(i=0;i<n;i++){
132032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
132132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
132232121132SShri Abhyankar   }
132332121132SShri Abhyankar 
132432121132SShri Abhyankar   /* forward solve the U^T */
132532121132SShri Abhyankar   idx = 0;
132632121132SShri Abhyankar   for (i=0; i<n; i++) {
132732121132SShri Abhyankar     v     = aa + bs2*diag[i];
132832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
132932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
133032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
133132121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
133232121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
133332121132SShri Abhyankar     v -= bs2;
133432121132SShri Abhyankar 
133532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
133632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
133732121132SShri Abhyankar     for(j=0;j>-nz;j--){
133832121132SShri Abhyankar       oidx = bs*vi[j];
133932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
134032121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
134132121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
134232121132SShri Abhyankar       v  -= bs2;
134332121132SShri Abhyankar     }
134432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
134532121132SShri Abhyankar     idx += bs;
134632121132SShri Abhyankar   }
134732121132SShri Abhyankar   /* backward solve the L^T */
134832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
134932121132SShri Abhyankar     v    = aa + bs2*ai[i];
135032121132SShri Abhyankar     vi   = aj + ai[i];
135132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
135232121132SShri Abhyankar     idt  = bs*i;
135332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
135432121132SShri Abhyankar     for(j=0;j<nz;j++){
135532121132SShri Abhyankar       idx   = bs*vi[j];
135632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
135732121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
135832121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
135932121132SShri Abhyankar       v += bs2;
136032121132SShri Abhyankar     }
136132121132SShri Abhyankar   }
136232121132SShri Abhyankar 
136332121132SShri Abhyankar   /* copy t into x according to permutation */
136432121132SShri Abhyankar   for(i=0;i<n;i++){
136532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
136632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
136732121132SShri Abhyankar   }
136832121132SShri Abhyankar 
136932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
137032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13713649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
137232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
137332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
137432121132SShri Abhyankar   PetscFunctionReturn(0);
137532121132SShri Abhyankar }
137632121132SShri Abhyankar 
137732121132SShri Abhyankar #undef __FUNCT__
137806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
137906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1380f1af5d2fSBarry Smith {
1381f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1382f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
13836849ba73SBarry Smith   PetscErrorCode    ierr;
13845d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1385b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1386b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1387b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1388b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1389b3260449SShri Abhyankar   const PetscScalar *b;
1390f1af5d2fSBarry Smith 
1391f1af5d2fSBarry Smith   PetscFunctionBegin;
13923649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
13931ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1394f1af5d2fSBarry Smith   t  = a->solve_work;
1395f1af5d2fSBarry Smith 
1396f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1397f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1398f1af5d2fSBarry Smith 
1399f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1400f1af5d2fSBarry Smith   ii = 0;
1401f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1402f1af5d2fSBarry Smith     ic      = 4*c[i];
1403f1af5d2fSBarry Smith     t[ii]   = b[ic];
1404f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1405f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1406f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1407f1af5d2fSBarry Smith     ii += 4;
1408f1af5d2fSBarry Smith   }
1409f1af5d2fSBarry Smith 
1410f1af5d2fSBarry Smith   /* forward solve the U^T */
1411f1af5d2fSBarry Smith   idx = 0;
1412f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1413f1af5d2fSBarry Smith 
1414f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1415f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1416f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1417f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1418f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1419f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1420f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1421f1af5d2fSBarry Smith     v += 16;
1422f1af5d2fSBarry Smith 
1423f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1424f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1425f1af5d2fSBarry Smith     while (nz--) {
1426f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1427f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1428f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1429f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1430f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1431f1af5d2fSBarry Smith       v  += 16;
1432f1af5d2fSBarry Smith     }
1433f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1434f1af5d2fSBarry Smith     idx += 4;
1435f1af5d2fSBarry Smith   }
1436f1af5d2fSBarry Smith   /* backward solve the L^T */
1437f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1438f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1439f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1440f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1441f1af5d2fSBarry Smith     idt  = 4*i;
1442f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1443f1af5d2fSBarry Smith     while (nz--) {
1444f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1445f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1446f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1447f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1448f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1449f1af5d2fSBarry Smith       v -= 16;
1450f1af5d2fSBarry Smith     }
1451f1af5d2fSBarry Smith   }
1452f1af5d2fSBarry Smith 
1453f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1454f1af5d2fSBarry Smith   ii = 0;
1455f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1456f1af5d2fSBarry Smith     ir      = 4*r[i];
1457f1af5d2fSBarry Smith     x[ir]   = t[ii];
1458f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1459f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1460f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1461f1af5d2fSBarry Smith     ii += 4;
1462f1af5d2fSBarry Smith   }
1463f1af5d2fSBarry Smith 
1464f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1465f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14663649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
14671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1468dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1469f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1470f1af5d2fSBarry Smith }
1471f1af5d2fSBarry Smith 
14724a2ae208SSatish Balay #undef __FUNCT__
14734dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
14744dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
147532121132SShri Abhyankar {
147632121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
147732121132SShri Abhyankar   PetscErrorCode    ierr;
147832121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1479b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
148032121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
148132121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1482b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1483b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1484b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1485b3260449SShri Abhyankar   const PetscScalar *b;
148632121132SShri Abhyankar 
148732121132SShri Abhyankar   PetscFunctionBegin;
14883649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
148932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
149032121132SShri Abhyankar   t = a->solve_work;
149132121132SShri Abhyankar 
149232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
149332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
149432121132SShri Abhyankar 
149532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
149632121132SShri Abhyankar   for(i=0;i<n;i++){
149732121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
149832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
149932121132SShri Abhyankar   }
150032121132SShri Abhyankar 
150132121132SShri Abhyankar   /* forward solve the U^T */
150232121132SShri Abhyankar   idx = 0;
150332121132SShri Abhyankar   for (i=0; i<n; i++) {
150432121132SShri Abhyankar     v     = aa + bs2*diag[i];
150532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
150632121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
150732121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
150832121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
150932121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
151032121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
151132121132SShri Abhyankar     v -= bs2;
151232121132SShri Abhyankar 
151332121132SShri Abhyankar     vi    = aj + diag[i] - 1;
151432121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
151532121132SShri Abhyankar     for(j=0;j>-nz;j--){
151632121132SShri Abhyankar       oidx = bs*vi[j];
151732121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
151832121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
151932121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
152032121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
152132121132SShri Abhyankar       v  -= bs2;
152232121132SShri Abhyankar     }
152332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
152432121132SShri Abhyankar     idx += bs;
152532121132SShri Abhyankar   }
152632121132SShri Abhyankar   /* backward solve the L^T */
152732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
152832121132SShri Abhyankar     v    = aa + bs2*ai[i];
152932121132SShri Abhyankar     vi   = aj + ai[i];
153032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
153132121132SShri Abhyankar     idt  = bs*i;
153232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
153332121132SShri Abhyankar     for(j=0;j<nz;j++){
153432121132SShri Abhyankar       idx   = bs*vi[j];
153532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
153632121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
153732121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
153832121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
153932121132SShri Abhyankar       v += bs2;
154032121132SShri Abhyankar     }
154132121132SShri Abhyankar   }
154232121132SShri Abhyankar 
154332121132SShri Abhyankar   /* copy t into x according to permutation */
154432121132SShri Abhyankar   for(i=0;i<n;i++){
154532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
154632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
154732121132SShri Abhyankar   }
154832121132SShri Abhyankar 
154932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
155032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15513649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
155232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
155332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
155432121132SShri Abhyankar   PetscFunctionReturn(0);
155532121132SShri Abhyankar }
155632121132SShri Abhyankar 
155732121132SShri Abhyankar #undef __FUNCT__
155806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
155906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1560f1af5d2fSBarry Smith {
1561f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1562f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
15636849ba73SBarry Smith   PetscErrorCode    ierr;
15645d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1565b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1566b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1567b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1568b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1569b3260449SShri Abhyankar   const PetscScalar *b;
1570f1af5d2fSBarry Smith 
1571f1af5d2fSBarry Smith   PetscFunctionBegin;
15723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
15731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1574f1af5d2fSBarry Smith   t  = a->solve_work;
1575f1af5d2fSBarry Smith 
1576f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1577f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1578f1af5d2fSBarry Smith 
1579f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1580f1af5d2fSBarry Smith   ii = 0;
1581f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1582f1af5d2fSBarry Smith     ic      = 5*c[i];
1583f1af5d2fSBarry Smith     t[ii]   = b[ic];
1584f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1585f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1586f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1587f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1588f1af5d2fSBarry Smith     ii += 5;
1589f1af5d2fSBarry Smith   }
1590f1af5d2fSBarry Smith 
1591f1af5d2fSBarry Smith   /* forward solve the U^T */
1592f1af5d2fSBarry Smith   idx = 0;
1593f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1594f1af5d2fSBarry Smith 
1595f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1596f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1597f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1598f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1599f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1600f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1601f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1602f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1603f1af5d2fSBarry Smith     v += 25;
1604f1af5d2fSBarry Smith 
1605f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1606f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1607f1af5d2fSBarry Smith     while (nz--) {
1608f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1609f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1610f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1611f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1612f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1613f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1614f1af5d2fSBarry Smith       v  += 25;
1615f1af5d2fSBarry Smith     }
1616f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1617f1af5d2fSBarry Smith     idx += 5;
1618f1af5d2fSBarry Smith   }
1619f1af5d2fSBarry Smith   /* backward solve the L^T */
1620f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1621f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1622f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1623f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1624f1af5d2fSBarry Smith     idt  = 5*i;
1625f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1626f1af5d2fSBarry Smith     while (nz--) {
1627f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1628f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1629f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1630f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1631f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1632f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1633f1af5d2fSBarry Smith       v -= 25;
1634f1af5d2fSBarry Smith     }
1635f1af5d2fSBarry Smith   }
1636f1af5d2fSBarry Smith 
1637f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1638f1af5d2fSBarry Smith   ii = 0;
1639f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1640f1af5d2fSBarry Smith     ir      = 5*r[i];
1641f1af5d2fSBarry Smith     x[ir]   = t[ii];
1642f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1643f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1644f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1645f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1646f1af5d2fSBarry Smith     ii += 5;
1647f1af5d2fSBarry Smith   }
1648f1af5d2fSBarry Smith 
1649f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1650f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16513649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1653dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1654f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1655f1af5d2fSBarry Smith }
1656f1af5d2fSBarry Smith 
16574a2ae208SSatish Balay #undef __FUNCT__
16584dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
16594dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
166032121132SShri Abhyankar {
166132121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
166232121132SShri Abhyankar   PetscErrorCode    ierr;
166332121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1664b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
166532121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
166632121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1667b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1668b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1669b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1670b3260449SShri Abhyankar   const PetscScalar *b;
167132121132SShri Abhyankar 
167232121132SShri Abhyankar   PetscFunctionBegin;
16733649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
167432121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
167532121132SShri Abhyankar   t = a->solve_work;
167632121132SShri Abhyankar 
167732121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167832121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
167932121132SShri Abhyankar 
168032121132SShri Abhyankar   /* copy b into temp work space according to permutation */
168132121132SShri Abhyankar   for(i=0;i<n;i++){
168232121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
168332121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
168432121132SShri Abhyankar     t[ii+4] = b[ic+4];
168532121132SShri Abhyankar   }
168632121132SShri Abhyankar 
168732121132SShri Abhyankar   /* forward solve the U^T */
168832121132SShri Abhyankar   idx = 0;
168932121132SShri Abhyankar   for (i=0; i<n; i++) {
169032121132SShri Abhyankar     v     = aa + bs2*diag[i];
169132121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
169232121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
169332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
169432121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
169532121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
169632121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
169732121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
169832121132SShri Abhyankar     v -= bs2;
169932121132SShri Abhyankar 
170032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
170132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
170232121132SShri Abhyankar     for(j=0;j>-nz;j--){
170332121132SShri Abhyankar       oidx = bs*vi[j];
170432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
170532121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
170632121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
170732121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
170832121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
170932121132SShri Abhyankar       v  -= bs2;
171032121132SShri Abhyankar     }
171132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
171232121132SShri Abhyankar     idx += bs;
171332121132SShri Abhyankar   }
171432121132SShri Abhyankar   /* backward solve the L^T */
171532121132SShri Abhyankar   for (i=n-1; i>=0; i--){
171632121132SShri Abhyankar     v    = aa + bs2*ai[i];
171732121132SShri Abhyankar     vi   = aj + ai[i];
171832121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
171932121132SShri Abhyankar     idt  = bs*i;
172032121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
172132121132SShri Abhyankar     for(j=0;j<nz;j++){
172232121132SShri Abhyankar       idx   = bs*vi[j];
172332121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
172432121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
172532121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
172632121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
172732121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
172832121132SShri Abhyankar       v += bs2;
172932121132SShri Abhyankar     }
173032121132SShri Abhyankar   }
173132121132SShri Abhyankar 
173232121132SShri Abhyankar   /* copy t into x according to permutation */
173332121132SShri Abhyankar   for(i=0;i<n;i++){
173432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
173532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
173632121132SShri Abhyankar     x[ir+4] = t[ii+4];
173732121132SShri Abhyankar   }
173832121132SShri Abhyankar 
173932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
174032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17413649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
174232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
174332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
174432121132SShri Abhyankar   PetscFunctionReturn(0);
174532121132SShri Abhyankar }
174632121132SShri Abhyankar 
174732121132SShri Abhyankar #undef __FUNCT__
174806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
174906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1750f1af5d2fSBarry Smith {
1751f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1752f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
17536849ba73SBarry Smith   PetscErrorCode    ierr;
17545d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1755b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1756b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1757b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1758b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759b3260449SShri Abhyankar   const PetscScalar *b;
1760f1af5d2fSBarry Smith 
1761f1af5d2fSBarry Smith   PetscFunctionBegin;
17623649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
17631ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764f1af5d2fSBarry Smith   t  = a->solve_work;
1765f1af5d2fSBarry Smith 
1766f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768f1af5d2fSBarry Smith 
1769f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1770f1af5d2fSBarry Smith   ii = 0;
1771f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1772f1af5d2fSBarry Smith     ic      = 6*c[i];
1773f1af5d2fSBarry Smith     t[ii]   = b[ic];
1774f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1775f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1776f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1777f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1778f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1779f1af5d2fSBarry Smith     ii += 6;
1780f1af5d2fSBarry Smith   }
1781f1af5d2fSBarry Smith 
1782f1af5d2fSBarry Smith   /* forward solve the U^T */
1783f1af5d2fSBarry Smith   idx = 0;
1784f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1785f1af5d2fSBarry Smith 
1786f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1787f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1788f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1789f1af5d2fSBarry Smith     x6    = t[5+idx];
1790f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1791f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1792f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1793f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1794f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1795f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1796f1af5d2fSBarry Smith     v += 36;
1797f1af5d2fSBarry Smith 
1798f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1799f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1800f1af5d2fSBarry Smith     while (nz--) {
1801f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1802f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1803f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1804f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1805f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1806f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1807f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1808f1af5d2fSBarry Smith       v  += 36;
1809f1af5d2fSBarry Smith     }
1810f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1811f1af5d2fSBarry Smith     t[5+idx] = s6;
1812f1af5d2fSBarry Smith     idx += 6;
1813f1af5d2fSBarry Smith   }
1814f1af5d2fSBarry Smith   /* backward solve the L^T */
1815f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1816f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1817f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1818f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1819f1af5d2fSBarry Smith     idt  = 6*i;
1820f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1821f1af5d2fSBarry Smith     s6 = t[5+idt];
1822f1af5d2fSBarry Smith     while (nz--) {
1823f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1824f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1825f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1826f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1827f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1828f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1829f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1830f1af5d2fSBarry Smith       v -= 36;
1831f1af5d2fSBarry Smith     }
1832f1af5d2fSBarry Smith   }
1833f1af5d2fSBarry Smith 
1834f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1835f1af5d2fSBarry Smith   ii = 0;
1836f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1837f1af5d2fSBarry Smith     ir      = 6*r[i];
1838f1af5d2fSBarry Smith     x[ir]   = t[ii];
1839f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1840f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1841f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1842f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1843f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1844f1af5d2fSBarry Smith     ii += 6;
1845f1af5d2fSBarry Smith   }
1846f1af5d2fSBarry Smith 
1847f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1848f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18493649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
18501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1851dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1852f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1853f1af5d2fSBarry Smith }
1854f1af5d2fSBarry Smith 
18554a2ae208SSatish Balay #undef __FUNCT__
18564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
18574dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
185832121132SShri Abhyankar {
185932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
186032121132SShri Abhyankar   PetscErrorCode    ierr;
186132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1862b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
186332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
186432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1865b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1866b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1867b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1868b3260449SShri Abhyankar   const PetscScalar *b;
186932121132SShri Abhyankar 
187032121132SShri Abhyankar   PetscFunctionBegin;
18713649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
187232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
187332121132SShri Abhyankar   t = a->solve_work;
187432121132SShri Abhyankar 
187532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
187632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
187732121132SShri Abhyankar 
187832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
187932121132SShri Abhyankar   for(i=0;i<n;i++){
188032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
188132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
188232121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
188332121132SShri Abhyankar   }
188432121132SShri Abhyankar 
188532121132SShri Abhyankar   /* forward solve the U^T */
188632121132SShri Abhyankar   idx = 0;
188732121132SShri Abhyankar   for (i=0; i<n; i++) {
188832121132SShri Abhyankar     v     = aa + bs2*diag[i];
188932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
189032121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
189132121132SShri Abhyankar     x6    = t[5+idx];
189232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
189332121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
189432121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
189532121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
189632121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
189732121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
189832121132SShri Abhyankar     v -= bs2;
189932121132SShri Abhyankar 
190032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
190132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
190232121132SShri Abhyankar     for(j=0;j>-nz;j--){
190332121132SShri Abhyankar       oidx = bs*vi[j];
190432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
190532121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
190632121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
190732121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
190832121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
190932121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
191032121132SShri Abhyankar       v  -= bs2;
191132121132SShri Abhyankar     }
191232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
191332121132SShri Abhyankar     t[5+idx] = s6;
191432121132SShri Abhyankar     idx += bs;
191532121132SShri Abhyankar   }
191632121132SShri Abhyankar   /* backward solve the L^T */
191732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
191832121132SShri Abhyankar     v    = aa + bs2*ai[i];
191932121132SShri Abhyankar     vi   = aj + ai[i];
192032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
192132121132SShri Abhyankar     idt  = bs*i;
192232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
192332121132SShri Abhyankar     s6   = t[5+idt];
192432121132SShri Abhyankar    for(j=0;j<nz;j++){
192532121132SShri Abhyankar       idx   = bs*vi[j];
192632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
192732121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
192832121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
192932121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
193032121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
193132121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
193232121132SShri Abhyankar       v += bs2;
193332121132SShri Abhyankar     }
193432121132SShri Abhyankar   }
193532121132SShri Abhyankar 
193632121132SShri Abhyankar   /* copy t into x according to permutation */
193732121132SShri Abhyankar   for(i=0;i<n;i++){
193832121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
193932121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
194032121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
194132121132SShri Abhyankar   }
194232121132SShri Abhyankar 
194332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
194432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19453649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
194632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
194732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
194832121132SShri Abhyankar   PetscFunctionReturn(0);
194932121132SShri Abhyankar }
195032121132SShri Abhyankar 
195132121132SShri Abhyankar #undef __FUNCT__
195206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
195306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1954f1af5d2fSBarry Smith {
1955f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1956f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
19576849ba73SBarry Smith   PetscErrorCode    ierr;
19585d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1959b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1960b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1961b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1962b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1963b3260449SShri Abhyankar   const PetscScalar *b;
1964f1af5d2fSBarry Smith 
1965f1af5d2fSBarry Smith   PetscFunctionBegin;
19663649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1968f1af5d2fSBarry Smith   t  = a->solve_work;
1969f1af5d2fSBarry Smith 
1970f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1971f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1972f1af5d2fSBarry Smith 
1973f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1974f1af5d2fSBarry Smith   ii = 0;
1975f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1976f1af5d2fSBarry Smith     ic      = 7*c[i];
1977f1af5d2fSBarry Smith     t[ii]   = b[ic];
1978f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1979f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1980f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1981f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1982f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1983f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1984f1af5d2fSBarry Smith     ii += 7;
1985f1af5d2fSBarry Smith   }
1986f1af5d2fSBarry Smith 
1987f1af5d2fSBarry Smith   /* forward solve the U^T */
1988f1af5d2fSBarry Smith   idx = 0;
1989f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1990f1af5d2fSBarry Smith 
1991f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1992f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1993f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1994f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1995f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1996f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1997f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1998f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1999f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2000f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2001f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2002f1af5d2fSBarry Smith     v += 49;
2003f1af5d2fSBarry Smith 
2004f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
2005f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
2006f1af5d2fSBarry Smith     while (nz--) {
2007f1af5d2fSBarry Smith       oidx = 7*(*vi++);
2008f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2009f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2010f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2011f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2012f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2013f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2014f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2015f1af5d2fSBarry Smith       v  += 49;
2016f1af5d2fSBarry Smith     }
2017f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2018f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
2019f1af5d2fSBarry Smith     idx += 7;
2020f1af5d2fSBarry Smith   }
2021f1af5d2fSBarry Smith   /* backward solve the L^T */
2022f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
2023f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
2024f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
2025f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
2026f1af5d2fSBarry Smith     idt  = 7*i;
2027f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2028f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
2029f1af5d2fSBarry Smith     while (nz--) {
2030f1af5d2fSBarry Smith       idx   = 7*(*vi--);
2031f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2032f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2033f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2034f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2035f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2036f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2037f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2038f1af5d2fSBarry Smith       v -= 49;
2039f1af5d2fSBarry Smith     }
2040f1af5d2fSBarry Smith   }
2041f1af5d2fSBarry Smith 
2042f1af5d2fSBarry Smith   /* copy t into x according to permutation */
2043f1af5d2fSBarry Smith   ii = 0;
2044f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
2045f1af5d2fSBarry Smith     ir      = 7*r[i];
2046f1af5d2fSBarry Smith     x[ir]   = t[ii];
2047f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
2048f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
2049f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
2050f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
2051f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
2052f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
2053f1af5d2fSBarry Smith     ii += 7;
2054f1af5d2fSBarry Smith   }
2055f1af5d2fSBarry Smith 
2056f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2057f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20583649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
20591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2060dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2061f1af5d2fSBarry Smith   PetscFunctionReturn(0);
2062f1af5d2fSBarry Smith }
206332121132SShri Abhyankar #undef __FUNCT__
20644dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
20654dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
206632121132SShri Abhyankar {
206732121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
206832121132SShri Abhyankar   PetscErrorCode    ierr;
206932121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2070b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
207132121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
207232121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2073b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2074b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2075b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2076b3260449SShri Abhyankar   const PetscScalar *b;
207732121132SShri Abhyankar 
207832121132SShri Abhyankar   PetscFunctionBegin;
20793649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
208032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
208132121132SShri Abhyankar   t = a->solve_work;
208232121132SShri Abhyankar 
208332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
208432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
208532121132SShri Abhyankar 
208632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
208732121132SShri Abhyankar   for(i=0;i<n;i++){
208832121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
208932121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
209032121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
209132121132SShri Abhyankar   }
209232121132SShri Abhyankar 
209332121132SShri Abhyankar   /* forward solve the U^T */
209432121132SShri Abhyankar   idx = 0;
209532121132SShri Abhyankar   for (i=0; i<n; i++) {
209632121132SShri Abhyankar     v     = aa + bs2*diag[i];
209732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
209832121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
209932121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
210032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
210132121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
210232121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
210332121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
210432121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
210532121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
210632121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
210732121132SShri Abhyankar     v -= bs2;
210832121132SShri Abhyankar 
210932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
211032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
211132121132SShri Abhyankar     for(j=0;j>-nz;j--){
211232121132SShri Abhyankar       oidx = bs*vi[j];
211332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
211432121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
211532121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
211632121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
211732121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
211832121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
211932121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
212032121132SShri Abhyankar       v  -= bs2;
212132121132SShri Abhyankar     }
212232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
212332121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
212432121132SShri Abhyankar     idx += bs;
212532121132SShri Abhyankar   }
212632121132SShri Abhyankar   /* backward solve the L^T */
212732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
212832121132SShri Abhyankar     v    = aa + bs2*ai[i];
212932121132SShri Abhyankar     vi   = aj + ai[i];
213032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
213132121132SShri Abhyankar     idt  = bs*i;
213232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
213332121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
213432121132SShri Abhyankar    for(j=0;j<nz;j++){
213532121132SShri Abhyankar       idx   = bs*vi[j];
213632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
213732121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
213832121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
213932121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
214032121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
214132121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
214232121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
214332121132SShri Abhyankar       v += bs2;
214432121132SShri Abhyankar     }
214532121132SShri Abhyankar   }
214632121132SShri Abhyankar 
214732121132SShri Abhyankar   /* copy t into x according to permutation */
214832121132SShri Abhyankar   for(i=0;i<n;i++){
214932121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
215032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
215132121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
215232121132SShri Abhyankar   }
215332121132SShri Abhyankar 
215432121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
215532121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
215732121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
215832121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
215932121132SShri Abhyankar   PetscFunctionReturn(0);
216032121132SShri Abhyankar }
2161f1af5d2fSBarry Smith 
21624e2b4712SSatish Balay /* ----------------------------------------------------------- */
21634a2ae208SSatish Balay #undef __FUNCT__
216406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
216506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21664e2b4712SSatish Balay {
21674e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21684e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21696849ba73SBarry Smith   PetscErrorCode    ierr;
2170b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2171b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2172b3260449SShri Abhyankar   PetscInt          i,nz;
2173b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2174b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2175b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2176b3260449SShri Abhyankar   const PetscScalar *b;
21774e2b4712SSatish Balay 
21784e2b4712SSatish Balay   PetscFunctionBegin;
21793649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2181f1af5d2fSBarry Smith   t  = a->solve_work;
21824e2b4712SSatish Balay 
21834e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21844e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21854e2b4712SSatish Balay 
21864e2b4712SSatish Balay   /* forward solve the lower triangular */
218787828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21884e2b4712SSatish Balay   for (i=1; i<n; i++) {
21894e2b4712SSatish Balay     v   = aa + bs2*ai[i];
21904e2b4712SSatish Balay     vi  = aj + ai[i];
21914e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2192f1af5d2fSBarry Smith     s = t + bs*i;
219387828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21944e2b4712SSatish Balay     while (nz--) {
2195f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
21964e2b4712SSatish Balay       v += bs2;
21974e2b4712SSatish Balay     }
21984e2b4712SSatish Balay   }
21994e2b4712SSatish Balay   /* backward solve the upper triangular */
2200d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
22014e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22024e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
22034e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
22044e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
220587828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22064e2b4712SSatish Balay     while (nz--) {
2207f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
22084e2b4712SSatish Balay       v += bs2;
22094e2b4712SSatish Balay     }
2210f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
221187828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22124e2b4712SSatish Balay   }
22134e2b4712SSatish Balay 
22144e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22154e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2218dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22194e2b4712SSatish Balay   PetscFunctionReturn(0);
22204e2b4712SSatish Balay }
22214e2b4712SSatish Balay 
22225c42ef9dSBarry Smith /* ----------------------------------------------------------- */
22235c42ef9dSBarry Smith #undef __FUNCT__
222406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
222506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
22265c42ef9dSBarry Smith {
22275c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22285c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
22295c42ef9dSBarry Smith   PetscErrorCode    ierr;
22305c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2231b3260449SShri Abhyankar   PetscInt          i,nz,j;
2232b3260449SShri Abhyankar   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
22335c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
22345c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
22355c42ef9dSBarry Smith   const PetscScalar *b;
22365c42ef9dSBarry Smith   PetscFunctionBegin;
22373649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
22385c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22395c42ef9dSBarry Smith   t    = a->solve_work;
22405c42ef9dSBarry Smith 
22415c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22425c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22435c42ef9dSBarry Smith 
22445c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
22455c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22465c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22475c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
22485c42ef9dSBarry Smith     }
22495c42ef9dSBarry Smith   }
22505c42ef9dSBarry Smith 
22515c42ef9dSBarry Smith 
22525c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
22535c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
22545c42ef9dSBarry Smith   for (i=0; i<n; i++){
22555c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22565c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
22575c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
22585c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
22595c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
22605c42ef9dSBarry Smith     while (nz--) {
22615c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22625c42ef9dSBarry Smith       v += bs2;
22635c42ef9dSBarry Smith     }
22645c42ef9dSBarry Smith   }
22655c42ef9dSBarry Smith 
22665c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
22675c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
22685c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
22695c42ef9dSBarry Smith     vi  = aj + ai[i];
22705c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
22715c42ef9dSBarry Smith     while (nz--) {
22725c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22735c42ef9dSBarry Smith       v += bs2;
22745c42ef9dSBarry Smith     }
22755c42ef9dSBarry Smith   }
22765c42ef9dSBarry Smith 
22775c42ef9dSBarry Smith   /* copy t into x according to permutation */
22785c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22795c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22805c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
22815c42ef9dSBarry Smith     }
22825c42ef9dSBarry Smith   }
22835c42ef9dSBarry Smith 
22845c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22855c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22863649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22875c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22885c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22895c42ef9dSBarry Smith   PetscFunctionReturn(0);
22905c42ef9dSBarry Smith }
22915c42ef9dSBarry Smith 
22924a2ae208SSatish Balay #undef __FUNCT__
22934dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
22944dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
22958499736aSShri Abhyankar {
22968499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22978499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22988499736aSShri Abhyankar   PetscErrorCode    ierr;
2299b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2300b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2301b3260449SShri Abhyankar   PetscInt          i,j,nz;
2302b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
23038499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
23048499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
23058499736aSShri Abhyankar   const PetscScalar *b;
2306b3260449SShri Abhyankar 
23078499736aSShri Abhyankar   PetscFunctionBegin;
23083649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23098499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23108499736aSShri Abhyankar   t    = a->solve_work;
23118499736aSShri Abhyankar 
23128499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23138499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
23148499736aSShri Abhyankar 
23158499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
23168499736aSShri Abhyankar   for (i=0; i<n; i++) {
23178499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23188499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
23198499736aSShri Abhyankar     }
23208499736aSShri Abhyankar   }
23218499736aSShri Abhyankar 
23228499736aSShri Abhyankar 
23238499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
23248499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
23258499736aSShri Abhyankar   for (i=0; i<n; i++){
23268499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
23278499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
23288499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
23298499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
23308499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
23318499736aSShri Abhyankar     for(j=0;j>-nz;j--){
23328499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23338499736aSShri Abhyankar       v -= bs2;
23348499736aSShri Abhyankar     }
23358499736aSShri Abhyankar   }
23368499736aSShri Abhyankar 
23378499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
23388499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
23398499736aSShri Abhyankar     v   = aa + bs2*ai[i];
23408499736aSShri Abhyankar     vi  = aj + ai[i];
23418499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
23428499736aSShri Abhyankar     for(j=0;j<nz;j++){
23438499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23448499736aSShri Abhyankar       v += bs2;
23458499736aSShri Abhyankar     }
23468499736aSShri Abhyankar   }
23478499736aSShri Abhyankar 
23488499736aSShri Abhyankar   /* copy t into x according to permutation */
23498499736aSShri Abhyankar   for (i=0; i<n; i++) {
23508499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23518499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
23528499736aSShri Abhyankar     }
23538499736aSShri Abhyankar   }
23548499736aSShri Abhyankar 
23558499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23568499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23573649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
23588499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23598499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
23608499736aSShri Abhyankar   PetscFunctionReturn(0);
23618499736aSShri Abhyankar }
23628499736aSShri Abhyankar 
2363832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
236429a97285SShri Abhyankar 
23652b0b2ea7SShri Abhyankar #undef __FUNCT__
2366832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2367832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
23682b0b2ea7SShri Abhyankar {
23692b0b2ea7SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
23702b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2371b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23720fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
23730b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
23742b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
23752b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
23760fa040f9SShri Abhyankar   PetscScalar       *x;
23770b68f018SBarry Smith   const PetscScalar *b;
23782b0b2ea7SShri Abhyankar 
23792b0b2ea7SShri Abhyankar   PetscFunctionBegin;
23803649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23812b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23822b0b2ea7SShri Abhyankar 
23832b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
238429a97285SShri Abhyankar   idx    = 0;
23850fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
23860fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
23870fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
23882b0b2ea7SShri Abhyankar 
23892b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
23902b0b2ea7SShri Abhyankar     v     = aa + bs2*ai[i];
23912b0b2ea7SShri Abhyankar     vi    = aj + ai[i];
23922b0b2ea7SShri Abhyankar     nz    = ai[i+1] - ai[i];
23930fa040f9SShri Abhyankar     idt   = bs*i;
23940fa040f9SShri Abhyankar     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
23950fa040f9SShri Abhyankar     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
23960fa040f9SShri Abhyankar     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
23972b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
23982b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23990fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
24000fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24010fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24022b0b2ea7SShri Abhyankar 
24030b8f6341SShri Abhyankar 
24042b0b2ea7SShri Abhyankar       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24052b0b2ea7SShri Abhyankar       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24062b0b2ea7SShri Abhyankar       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24072b0b2ea7SShri Abhyankar       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24082b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24092b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24102b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24112b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24122b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24132b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24142b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24152b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24162b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24172b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24182b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24192b0b2ea7SShri Abhyankar 
24202b0b2ea7SShri Abhyankar       v += bs2;
24212b0b2ea7SShri Abhyankar     }
24220fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
24230fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
24240fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
24252b0b2ea7SShri Abhyankar 
24262b0b2ea7SShri Abhyankar   }
24272b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
24282b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--){
24292b0b2ea7SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
24302b0b2ea7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
24312b0b2ea7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
24322b0b2ea7SShri Abhyankar     idt  = bs*i;
24330fa040f9SShri Abhyankar     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
24340fa040f9SShri Abhyankar     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
24350fa040f9SShri Abhyankar     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
24362b0b2ea7SShri Abhyankar 
24372b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
24382b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
24390fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
24400fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24410fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24422b0b2ea7SShri Abhyankar 
24432b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24442b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24452b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24462b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24472b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24482b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24492b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24502b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24512b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24522b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24532b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24542b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24552b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24562b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24572b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24582b0b2ea7SShri Abhyankar 
24592b0b2ea7SShri Abhyankar       v += bs2;
24602b0b2ea7SShri Abhyankar     }
24612b0b2ea7SShri Abhyankar 
24620fa040f9SShri Abhyankar     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
24630fa040f9SShri Abhyankar     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
24640fa040f9SShri Abhyankar     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
24650fa040f9SShri Abhyankar     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
24660fa040f9SShri Abhyankar     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
24670fa040f9SShri Abhyankar     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
24680fa040f9SShri Abhyankar     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
24690fa040f9SShri Abhyankar     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
24700fa040f9SShri Abhyankar     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
24710fa040f9SShri Abhyankar     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
24720fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
24730fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
24740fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
24750fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
24760fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
24772b0b2ea7SShri Abhyankar 
24782b0b2ea7SShri Abhyankar   }
24792b0b2ea7SShri Abhyankar 
24803649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
24812b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24822b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24832b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
24842b0b2ea7SShri Abhyankar }
24852b0b2ea7SShri Abhyankar 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2487832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2488832cc040SShri Abhyankar 
24898499736aSShri Abhyankar #undef __FUNCT__
2490832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2491832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
24920b8f6341SShri Abhyankar {
24930b8f6341SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
24940b8f6341SShri Abhyankar   PetscErrorCode    ierr;
24950b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
249653ef36baSBarry Smith   PetscInt          i,k,nz,idx,idt,m;
24970b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24980b8f6341SShri Abhyankar   PetscScalar       s[15];
249953ef36baSBarry Smith   PetscScalar       *x,xv;
25000b8f6341SShri Abhyankar   const PetscScalar *b;
25010b8f6341SShri Abhyankar 
25020b8f6341SShri Abhyankar   PetscFunctionBegin;
25033649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25040b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25050b8f6341SShri Abhyankar 
25060b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2507832cc040SShri Abhyankar   for (i=0; i<n; i++) {
25080b8f6341SShri Abhyankar     v     = aa + bs2*ai[i];
25090b8f6341SShri Abhyankar     vi    = aj + ai[i];
25100b8f6341SShri Abhyankar     nz    = ai[i+1] - ai[i];
25110fa040f9SShri Abhyankar     idt   = bs*i;
2512832cc040SShri Abhyankar     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2513832cc040SShri Abhyankar     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2514832cc040SShri Abhyankar     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
25150b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
25160b8f6341SShri Abhyankar       idx   = bs*vi[m];
25170b8f6341SShri Abhyankar       for(k=0;k<15;k++){
251853ef36baSBarry Smith 	xv        = x[k + idx];
251953ef36baSBarry Smith 	x[idt]    -= v[0]*xv;
252053ef36baSBarry Smith 	x[1+idt]  -= v[1]*xv;
252153ef36baSBarry Smith 	x[2+idt]  -= v[2]*xv;
252253ef36baSBarry Smith         x[3+idt]  -= v[3]*xv;
252353ef36baSBarry Smith 	x[4+idt]  -= v[4]*xv;
252453ef36baSBarry Smith 	x[5+idt]  -= v[5]*xv;
252553ef36baSBarry Smith 	x[6+idt]  -= v[6]*xv;
252653ef36baSBarry Smith         x[7+idt]  -= v[7]*xv;
252753ef36baSBarry Smith 	x[8+idt]  -= v[8]*xv;
252853ef36baSBarry Smith 	x[9+idt]  -= v[9]*xv;
252953ef36baSBarry Smith 	x[10+idt] -= v[10]*xv;
253053ef36baSBarry Smith         x[11+idt] -= v[11]*xv;
253153ef36baSBarry Smith 	x[12+idt] -= v[12]*xv;
253253ef36baSBarry Smith 	x[13+idt] -= v[13]*xv;
253353ef36baSBarry Smith 	x[14+idt] -= v[14]*xv;
25340b8f6341SShri Abhyankar 	v += 15;
25350b8f6341SShri Abhyankar       }
25360b8f6341SShri Abhyankar     }
25370b8f6341SShri Abhyankar   }
25380b8f6341SShri Abhyankar   /* backward solve the upper triangular */
25390b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--){
25400b8f6341SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
25410b8f6341SShri Abhyankar     vi   = aj + adiag[i+1]+1;
25420b8f6341SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
25430b8f6341SShri Abhyankar     idt  = bs*i;
25440fa040f9SShri Abhyankar     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
25450fa040f9SShri Abhyankar     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
25460fa040f9SShri Abhyankar     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
25470b8f6341SShri Abhyankar 
25480b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
25490b8f6341SShri Abhyankar       idx   = bs*vi[m];
25500b8f6341SShri Abhyankar       for(k=0;k<15;k++){
255153ef36baSBarry Smith 	xv = x[k + idx];
255253ef36baSBarry Smith 	s[0]  -= v[0]*xv;
255353ef36baSBarry Smith 	s[1]  -= v[1]*xv;
255453ef36baSBarry Smith 	s[2]  -= v[2]*xv;
255553ef36baSBarry Smith         s[3]  -= v[3]*xv;
255653ef36baSBarry Smith 	s[4]  -= v[4]*xv;
255753ef36baSBarry Smith 	s[5]  -= v[5]*xv;
255853ef36baSBarry Smith 	s[6]  -= v[6]*xv;
255953ef36baSBarry Smith         s[7]  -= v[7]*xv;
256053ef36baSBarry Smith 	s[8]  -= v[8]*xv;
256153ef36baSBarry Smith 	s[9]  -= v[9]*xv;
256253ef36baSBarry Smith 	s[10] -= v[10]*xv;
256353ef36baSBarry Smith         s[11] -= v[11]*xv;
256453ef36baSBarry Smith 	s[12] -= v[12]*xv;
256553ef36baSBarry Smith 	s[13] -= v[13]*xv;
256653ef36baSBarry Smith 	s[14] -= v[14]*xv;
25670b8f6341SShri Abhyankar 	v += 15;
25680b8f6341SShri Abhyankar       }
25690b8f6341SShri Abhyankar     }
25700fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
25710b8f6341SShri Abhyankar     for(k=0;k<15;k++){
25720fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
25730fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
25740fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
25750fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
25760fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
25770fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
25780fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
25790fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
25800fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
25810fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
25820fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
25830fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
25840fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
25850fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
25860fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
25870b8f6341SShri Abhyankar       v += 15;
25880b8f6341SShri Abhyankar     }
25890b8f6341SShri Abhyankar   }
25903649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
25910b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25920b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
25930b8f6341SShri Abhyankar   PetscFunctionReturn(0);
25940b8f6341SShri Abhyankar }
25950b8f6341SShri Abhyankar 
25960b8f6341SShri Abhyankar 
25970b8f6341SShri Abhyankar #undef __FUNCT__
259806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
259906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
26004e2b4712SSatish Balay {
26014e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
26024e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
26036849ba73SBarry Smith   PetscErrorCode    ierr;
2604b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2605b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2606b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2607b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2608b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2609b3260449SShri Abhyankar   const PetscScalar *b;
26104e2b4712SSatish Balay 
26114e2b4712SSatish Balay   PetscFunctionBegin;
26123649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2614f1af5d2fSBarry Smith   t  = a->solve_work;
26154e2b4712SSatish Balay 
26164e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26174e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26184e2b4712SSatish Balay 
26194e2b4712SSatish Balay   /* forward solve the lower triangular */
26204e2b4712SSatish Balay   idx    = 7*(*r++);
2621f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2622f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2623f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
26244e2b4712SSatish Balay 
26254e2b4712SSatish Balay   for (i=1; i<n; i++) {
26264e2b4712SSatish Balay     v     = aa + 49*ai[i];
26274e2b4712SSatish Balay     vi    = aj + ai[i];
26284e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26294e2b4712SSatish Balay     idx   = 7*(*r++);
2630f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2631f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
26324e2b4712SSatish Balay     while (nz--) {
26334e2b4712SSatish Balay       idx   = 7*(*vi++);
2634f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2635f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2636f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2637f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2638f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2639f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2640f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2641f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2642f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2643f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26444e2b4712SSatish Balay       v += 49;
26454e2b4712SSatish Balay     }
26464e2b4712SSatish Balay     idx = 7*i;
2647f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2648f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2649f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
26504e2b4712SSatish Balay   }
26514e2b4712SSatish Balay   /* backward solve the upper triangular */
26524e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26534e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
26544e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26554e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26564e2b4712SSatish Balay     idt  = 7*i;
2657f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2658f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2659f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
26604e2b4712SSatish Balay     while (nz--) {
26614e2b4712SSatish Balay       idx   = 7*(*vi++);
2662f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2663f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2664f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2665f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2666f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2667f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2668f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2669f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2670f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2671f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26724e2b4712SSatish Balay       v += 49;
26734e2b4712SSatish Balay     }
26744e2b4712SSatish Balay     idc = 7*(*c--);
26754e2b4712SSatish Balay     v   = aa + 49*diag[i];
2676f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2677f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2678f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2679f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2680f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2681f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2682f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2683f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2684f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2685f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2686f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2687f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2688f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2689f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
26904e2b4712SSatish Balay   }
26914e2b4712SSatish Balay 
26924e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26934e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26943649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2696dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26974e2b4712SSatish Balay   PetscFunctionReturn(0);
26984e2b4712SSatish Balay }
26994e2b4712SSatish Balay 
27008f690400SShri Abhyankar #undef __FUNCT__
27014dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
27024dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
270335aa4fcfSShri Abhyankar {
270435aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
270535aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
270635aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2707b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2708b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2709b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2710b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2711b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2712b3260449SShri Abhyankar   const PetscScalar *b;
271335aa4fcfSShri Abhyankar 
271435aa4fcfSShri Abhyankar   PetscFunctionBegin;
27153649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
271635aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
271735aa4fcfSShri Abhyankar   t  = a->solve_work;
271835aa4fcfSShri Abhyankar 
271935aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
272035aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
272135aa4fcfSShri Abhyankar 
272235aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
272335aa4fcfSShri Abhyankar   idx    = 7*r[0];
272435aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
272535aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
272635aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
272735aa4fcfSShri Abhyankar 
272835aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
272935aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
273035aa4fcfSShri Abhyankar     vi    = aj + ai[i];
273135aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
273235aa4fcfSShri Abhyankar     idx   = 7*r[i];
273335aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
273435aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
273535aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
273635aa4fcfSShri Abhyankar       idx   = 7*vi[m];
273735aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
273835aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
273935aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
274035aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
274135aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
274235aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
274335aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
274435aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
274535aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
274635aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
274735aa4fcfSShri Abhyankar       v += 49;
274835aa4fcfSShri Abhyankar     }
274935aa4fcfSShri Abhyankar     idx = 7*i;
275035aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
275135aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
275235aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
275335aa4fcfSShri Abhyankar   }
275435aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
275535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
275635aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
275735aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
275835aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
275935aa4fcfSShri Abhyankar     idt  = 7*i;
276035aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
276135aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
276235aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
276335aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
276435aa4fcfSShri Abhyankar       idx   = 7*vi[m];
276535aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
276635aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
276735aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
276835aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
276935aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
277035aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
277135aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
277235aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
277335aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
277435aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
277535aa4fcfSShri Abhyankar       v += 49;
277635aa4fcfSShri Abhyankar     }
277735aa4fcfSShri Abhyankar     idc = 7*c[i];
277835aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
277935aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
278035aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
278135aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
278235aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
278335aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
278435aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
278535aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
278635aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
278735aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
278835aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
278935aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
279035aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
279135aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
279235aa4fcfSShri Abhyankar   }
279335aa4fcfSShri Abhyankar 
279435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
279535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27963649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
279735aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
279835aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
279935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
280035aa4fcfSShri Abhyankar }
280135aa4fcfSShri Abhyankar 
280235aa4fcfSShri Abhyankar #undef __FUNCT__
280306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
280406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
280515091d37SBarry Smith {
280615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2807b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2808dfbe8321SBarry Smith   PetscErrorCode    ierr;
2809b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2810d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2811d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2812d9fead3dSBarry Smith   const PetscScalar *b;
281315091d37SBarry Smith 
281415091d37SBarry Smith   PetscFunctionBegin;
28153649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
28161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
281715091d37SBarry Smith   /* forward solve the lower triangular */
281815091d37SBarry Smith   idx    = 0;
281915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
282015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
282115091d37SBarry Smith   x[6] = b[6+idx];
282215091d37SBarry Smith   for (i=1; i<n; i++) {
282315091d37SBarry Smith     v     =  aa + 49*ai[i];
282415091d37SBarry Smith     vi    =  aj + ai[i];
282515091d37SBarry Smith     nz    =  diag[i] - ai[i];
282615091d37SBarry Smith     idx   =  7*i;
2827f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2828f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2829f1af5d2fSBarry Smith     s7  =  b[6+idx];
283015091d37SBarry Smith     while (nz--) {
283115091d37SBarry Smith       jdx   = 7*(*vi++);
283215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
283315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
283415091d37SBarry Smith       x7    = x[6+jdx];
2835f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2836f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2837f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2838f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2839f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2840f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2841f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
284215091d37SBarry Smith       v += 49;
284315091d37SBarry Smith      }
2844f1af5d2fSBarry Smith     x[idx]   = s1;
2845f1af5d2fSBarry Smith     x[1+idx] = s2;
2846f1af5d2fSBarry Smith     x[2+idx] = s3;
2847f1af5d2fSBarry Smith     x[3+idx] = s4;
2848f1af5d2fSBarry Smith     x[4+idx] = s5;
2849f1af5d2fSBarry Smith     x[5+idx] = s6;
2850f1af5d2fSBarry Smith     x[6+idx] = s7;
285115091d37SBarry Smith   }
285215091d37SBarry Smith   /* backward solve the upper triangular */
285315091d37SBarry Smith   for (i=n-1; i>=0; i--){
285415091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
285515091d37SBarry Smith     vi   = aj + diag[i] + 1;
285615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
285715091d37SBarry Smith     idt  = 7*i;
2858f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2859f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2860f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2861f1af5d2fSBarry Smith     s7 = x[6+idt];
286215091d37SBarry Smith     while (nz--) {
286315091d37SBarry Smith       idx   = 7*(*vi++);
286415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
286515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
286615091d37SBarry Smith       x7    = x[6+idx];
2867f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2868f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2869f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2870f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2871f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2872f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2873f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
287415091d37SBarry Smith       v += 49;
287515091d37SBarry Smith     }
287615091d37SBarry Smith     v        = aa + 49*diag[i];
2877f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2878f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2879f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2880f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2881f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2882f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2883f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2884f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2885f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2886f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2887f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2888f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2889f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2890f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
289115091d37SBarry Smith   }
289215091d37SBarry Smith 
28933649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
28941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2895dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
289615091d37SBarry Smith   PetscFunctionReturn(0);
289715091d37SBarry Smith }
289815091d37SBarry Smith 
2899cee9d6f2SShri Abhyankar #undef __FUNCT__
29004dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
29014dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
290253cca76cSShri Abhyankar {
290353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2904b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
290553cca76cSShri Abhyankar     PetscErrorCode    ierr;
2906b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
2907b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
290853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
290953cca76cSShri Abhyankar     PetscScalar       *x;
291053cca76cSShri Abhyankar     const PetscScalar *b;
291153cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
291253cca76cSShri Abhyankar 
291353cca76cSShri Abhyankar     PetscFunctionBegin;
29143649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
291553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291653cca76cSShri Abhyankar     /* forward solve the lower triangular */
291753cca76cSShri Abhyankar     idx    = 0;
291853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
291953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
292053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
292153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
292253cca76cSShri Abhyankar        vi   = aj + ai[i];
292353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
292453cca76cSShri Abhyankar       idx   = bs*i;
292553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
292653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
292753cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
292853cca76cSShri Abhyankar           jdx   = bs*vi[k];
292953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
293053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
293153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
293253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
293353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
293453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
293553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
293653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
293753cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
293853cca76cSShri Abhyankar           v   +=  bs2;
293953cca76cSShri Abhyankar         }
294053cca76cSShri Abhyankar 
294153cca76cSShri Abhyankar        x[idx]   = s1;
294253cca76cSShri Abhyankar        x[1+idx] = s2;
294353cca76cSShri Abhyankar        x[2+idx] = s3;
294453cca76cSShri Abhyankar        x[3+idx] = s4;
294553cca76cSShri Abhyankar        x[4+idx] = s5;
294653cca76cSShri Abhyankar        x[5+idx] = s6;
294753cca76cSShri Abhyankar        x[6+idx] = s7;
294853cca76cSShri Abhyankar     }
294953cca76cSShri Abhyankar 
295053cca76cSShri Abhyankar    /* backward solve the upper triangular */
295153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
295253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
295353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
295453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
295553cca76cSShri Abhyankar      idt = bs*i;
295653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
295753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
295853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
295953cca76cSShri Abhyankar       idx   = bs*vi[k];
296053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
296153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
296253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
296353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
296453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
296553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
296653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
296753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
296853cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
296953cca76cSShri Abhyankar         v   +=  bs2;
297053cca76cSShri Abhyankar     }
297153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
297253cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
297353cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
297453cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
297553cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
297653cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
297753cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
297853cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
297953cca76cSShri Abhyankar   }
298053cca76cSShri Abhyankar 
29813649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
298253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
298353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
298453cca76cSShri Abhyankar   PetscFunctionReturn(0);
298553cca76cSShri Abhyankar }
298653cca76cSShri Abhyankar 
298753cca76cSShri Abhyankar #undef __FUNCT__
298806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
298906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
299015091d37SBarry Smith {
299115091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
299215091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
29936849ba73SBarry Smith   PetscErrorCode    ierr;
29945d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2995b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2996b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2997d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2998d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2999d9fead3dSBarry Smith   const PetscScalar *b;
3000b3260449SShri Abhyankar 
300115091d37SBarry Smith   PetscFunctionBegin;
30023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30031ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3004f1af5d2fSBarry Smith   t  = a->solve_work;
300515091d37SBarry Smith 
300615091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
300715091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
300815091d37SBarry Smith 
300915091d37SBarry Smith   /* forward solve the lower triangular */
301015091d37SBarry Smith   idx    = 6*(*r++);
3011f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3012f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
3013f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
301415091d37SBarry Smith   for (i=1; i<n; i++) {
301515091d37SBarry Smith     v     = aa + 36*ai[i];
301615091d37SBarry Smith     vi    = aj + ai[i];
301715091d37SBarry Smith     nz    = diag[i] - ai[i];
301815091d37SBarry Smith     idx   = 6*(*r++);
3019f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3020f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
302115091d37SBarry Smith     while (nz--) {
302215091d37SBarry Smith       idx   = 6*(*vi++);
3023f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3024f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3025f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3026f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3027f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3028f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3029f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3030f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
303115091d37SBarry Smith       v += 36;
303215091d37SBarry Smith     }
303315091d37SBarry Smith     idx = 6*i;
3034f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3035f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
3036f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
303715091d37SBarry Smith   }
303815091d37SBarry Smith   /* backward solve the upper triangular */
303915091d37SBarry Smith   for (i=n-1; i>=0; i--){
304015091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
304115091d37SBarry Smith     vi   = aj + diag[i] + 1;
304215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
304315091d37SBarry Smith     idt  = 6*i;
3044f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3045f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
3046f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
304715091d37SBarry Smith     while (nz--) {
304815091d37SBarry Smith       idx   = 6*(*vi++);
3049f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3050f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3051f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
3052f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3053f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3054f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3055f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3056f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3057f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
305815091d37SBarry Smith       v += 36;
305915091d37SBarry Smith     }
306015091d37SBarry Smith     idc = 6*(*c--);
306115091d37SBarry Smith     v   = aa + 36*diag[i];
3062f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3063f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
3064f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3065f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
3066f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3067f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
3068f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3069f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
3070f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3071f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
3072f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3073f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
307415091d37SBarry Smith   }
307515091d37SBarry Smith 
307615091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
307715091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30783649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
30791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3080dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
308115091d37SBarry Smith   PetscFunctionReturn(0);
308215091d37SBarry Smith }
308315091d37SBarry Smith 
30846506fda5SShri Abhyankar #undef __FUNCT__
30854dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
30864dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
30876506fda5SShri Abhyankar {
30886506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30896506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
30906506fda5SShri Abhyankar   PetscErrorCode    ierr;
30916506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3092b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3093b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
30946506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
30956506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
30966506fda5SShri Abhyankar   const PetscScalar *b;
3097b3260449SShri Abhyankar 
30986506fda5SShri Abhyankar   PetscFunctionBegin;
30993649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31006506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
31016506fda5SShri Abhyankar   t  = a->solve_work;
31026506fda5SShri Abhyankar 
31036506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
31046506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
31056506fda5SShri Abhyankar 
31066506fda5SShri Abhyankar   /* forward solve the lower triangular */
31076506fda5SShri Abhyankar   idx    = 6*r[0];
31086506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
31096506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
31106506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
31116506fda5SShri Abhyankar   for (i=1; i<n; i++) {
31126506fda5SShri Abhyankar     v     = aa + 36*ai[i];
31136506fda5SShri Abhyankar     vi    = aj + ai[i];
31146506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
31156506fda5SShri Abhyankar     idx   = 6*r[i];
31166506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
31176506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
31186506fda5SShri Abhyankar     for(m=0;m<nz;m++){
31196506fda5SShri Abhyankar       idx   = 6*vi[m];
31206506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
31216506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
31226506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31236506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31246506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31256506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31266506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31276506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31286506fda5SShri Abhyankar       v += 36;
31296506fda5SShri Abhyankar     }
31306506fda5SShri Abhyankar     idx = 6*i;
31316506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
31326506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
31336506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
31346506fda5SShri Abhyankar   }
31356506fda5SShri Abhyankar   /* backward solve the upper triangular */
31366506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
31376506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
31386506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
31396506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
31406506fda5SShri Abhyankar     idt  = 6*i;
31416506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
31426506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
31436506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
31446506fda5SShri Abhyankar     for(m=0;m<nz;m++){
31456506fda5SShri Abhyankar       idx   = 6*vi[m];
31466506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
31476506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
31486506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
31496506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31506506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31516506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31526506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31536506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31546506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31556506fda5SShri Abhyankar       v += 36;
31566506fda5SShri Abhyankar     }
31576506fda5SShri Abhyankar     idc = 6*c[i];
31586506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
31596506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
31606506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
31616506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
31626506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
31636506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
31646506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
31656506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
31666506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
31676506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
31686506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
31696506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
31706506fda5SShri Abhyankar   }
31716506fda5SShri Abhyankar 
31726506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31736506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31743649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
31756506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
31766506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
31776506fda5SShri Abhyankar   PetscFunctionReturn(0);
31786506fda5SShri Abhyankar }
31798f690400SShri Abhyankar 
31808f690400SShri Abhyankar #undef __FUNCT__
318106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
318206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
318315091d37SBarry Smith {
318415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3185b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3186dfbe8321SBarry Smith   PetscErrorCode    ierr;
3187b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3188d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3189d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3190d9fead3dSBarry Smith   const PetscScalar *b;
319115091d37SBarry Smith 
319215091d37SBarry Smith   PetscFunctionBegin;
31933649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
319515091d37SBarry Smith   /* forward solve the lower triangular */
319615091d37SBarry Smith   idx    = 0;
319715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
319815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
319915091d37SBarry Smith   for (i=1; i<n; i++) {
320015091d37SBarry Smith     v     =  aa + 36*ai[i];
320115091d37SBarry Smith     vi    =  aj + ai[i];
320215091d37SBarry Smith     nz    =  diag[i] - ai[i];
320315091d37SBarry Smith     idx   =  6*i;
3204f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3205f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
320615091d37SBarry Smith     while (nz--) {
320715091d37SBarry Smith       jdx   = 6*(*vi++);
320815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
320915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3210f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3211f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3212f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3213f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3214f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3215f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
321615091d37SBarry Smith       v += 36;
321715091d37SBarry Smith      }
3218f1af5d2fSBarry Smith     x[idx]   = s1;
3219f1af5d2fSBarry Smith     x[1+idx] = s2;
3220f1af5d2fSBarry Smith     x[2+idx] = s3;
3221f1af5d2fSBarry Smith     x[3+idx] = s4;
3222f1af5d2fSBarry Smith     x[4+idx] = s5;
3223f1af5d2fSBarry Smith     x[5+idx] = s6;
322415091d37SBarry Smith   }
322515091d37SBarry Smith   /* backward solve the upper triangular */
322615091d37SBarry Smith   for (i=n-1; i>=0; i--){
322715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
322815091d37SBarry Smith     vi   = aj + diag[i] + 1;
322915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
323015091d37SBarry Smith     idt  = 6*i;
3231f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
3232f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
3233f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
323415091d37SBarry Smith     while (nz--) {
323515091d37SBarry Smith       idx   = 6*(*vi++);
323615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
323715091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3238f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3239f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3240f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3241f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3242f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3243f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
324415091d37SBarry Smith       v += 36;
324515091d37SBarry Smith     }
324615091d37SBarry Smith     v        = aa + 36*diag[i];
3247f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3248f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3249f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3250f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3251f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3252f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
325315091d37SBarry Smith   }
325415091d37SBarry Smith 
32553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
32561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3257dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
325815091d37SBarry Smith   PetscFunctionReturn(0);
325915091d37SBarry Smith }
326015091d37SBarry Smith 
3261cee9d6f2SShri Abhyankar #undef __FUNCT__
32624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
32634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
326453cca76cSShri Abhyankar {
326553cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3266b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
326753cca76cSShri Abhyankar     PetscErrorCode    ierr;
3268b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
3269b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
327053cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
327153cca76cSShri Abhyankar     PetscScalar       *x;
327253cca76cSShri Abhyankar     const PetscScalar *b;
327353cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
327453cca76cSShri Abhyankar 
327553cca76cSShri Abhyankar     PetscFunctionBegin;
32763649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
327753cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
327853cca76cSShri Abhyankar     /* forward solve the lower triangular */
327953cca76cSShri Abhyankar     idx    = 0;
328053cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
328153cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
328253cca76cSShri Abhyankar     for (i=1; i<n; i++) {
328353cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
328453cca76cSShri Abhyankar        vi   = aj + ai[i];
328553cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
328653cca76cSShri Abhyankar       idx   = bs*i;
328753cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
328853cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
328953cca76cSShri Abhyankar        for(k=0;k<nz;k++){
329053cca76cSShri Abhyankar           jdx   = bs*vi[k];
329153cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
329253cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
329353cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
329453cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
329553cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
329653cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
329753cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
329853cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
329953cca76cSShri Abhyankar           v   +=  bs2;
330053cca76cSShri Abhyankar         }
330153cca76cSShri Abhyankar 
330253cca76cSShri Abhyankar        x[idx]   = s1;
330353cca76cSShri Abhyankar        x[1+idx] = s2;
330453cca76cSShri Abhyankar        x[2+idx] = s3;
330553cca76cSShri Abhyankar        x[3+idx] = s4;
330653cca76cSShri Abhyankar        x[4+idx] = s5;
330753cca76cSShri Abhyankar        x[5+idx] = s6;
330853cca76cSShri Abhyankar     }
330953cca76cSShri Abhyankar 
331053cca76cSShri Abhyankar    /* backward solve the upper triangular */
331153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
331253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
331353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
331453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
331553cca76cSShri Abhyankar      idt = bs*i;
331653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
331753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
331853cca76cSShri Abhyankar      for(k=0;k<nz;k++){
331953cca76cSShri Abhyankar       idx   = bs*vi[k];
332053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
332153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
332253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
332353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
332453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
332553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
332653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
332753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
332853cca76cSShri Abhyankar         v   +=  bs2;
332953cca76cSShri Abhyankar     }
333053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
333153cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
333253cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
333353cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
333453cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
333553cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
333653cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
333753cca76cSShri Abhyankar   }
333853cca76cSShri Abhyankar 
33393649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
334053cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
334153cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
334253cca76cSShri Abhyankar   PetscFunctionReturn(0);
334353cca76cSShri Abhyankar }
334453cca76cSShri Abhyankar 
334553cca76cSShri Abhyankar #undef __FUNCT__
334606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
334706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
33484e2b4712SSatish Balay {
33494e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
33504e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33516849ba73SBarry Smith   PetscErrorCode    ierr;
33525d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3353b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3354b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3355d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3356d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3357d9fead3dSBarry Smith   const PetscScalar *b;
33584e2b4712SSatish Balay 
33594e2b4712SSatish Balay   PetscFunctionBegin;
33603649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
33611ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3362f1af5d2fSBarry Smith   t  = a->solve_work;
33634e2b4712SSatish Balay 
33644e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33654e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33664e2b4712SSatish Balay 
33674e2b4712SSatish Balay   /* forward solve the lower triangular */
33684e2b4712SSatish Balay   idx    = 5*(*r++);
3369f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3370f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
33714e2b4712SSatish Balay   for (i=1; i<n; i++) {
33724e2b4712SSatish Balay     v     = aa + 25*ai[i];
33734e2b4712SSatish Balay     vi    = aj + ai[i];
33744e2b4712SSatish Balay     nz    = diag[i] - ai[i];
33754e2b4712SSatish Balay     idx   = 5*(*r++);
3376f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3377f1af5d2fSBarry Smith     s5  = b[4+idx];
33784e2b4712SSatish Balay     while (nz--) {
33794e2b4712SSatish Balay       idx   = 5*(*vi++);
3380f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3381f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3382f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3383f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3384f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3385f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3386f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33874e2b4712SSatish Balay       v += 25;
33884e2b4712SSatish Balay     }
33894e2b4712SSatish Balay     idx = 5*i;
3390f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3391f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
33924e2b4712SSatish Balay   }
33934e2b4712SSatish Balay   /* backward solve the upper triangular */
33944e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33954e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
33964e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33974e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
33984e2b4712SSatish Balay     idt  = 5*i;
3399f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3400f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
34014e2b4712SSatish Balay     while (nz--) {
34024e2b4712SSatish Balay       idx   = 5*(*vi++);
3403f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3404f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3405f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3406f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3407f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3408f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3409f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
34104e2b4712SSatish Balay       v += 25;
34114e2b4712SSatish Balay     }
34124e2b4712SSatish Balay     idc = 5*(*c--);
34134e2b4712SSatish Balay     v   = aa + 25*diag[i];
3414f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3415f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3416f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3417f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3418f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3419f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3420f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3421f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3422f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3423f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
34244e2b4712SSatish Balay   }
34254e2b4712SSatish Balay 
34264e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
34274e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34283649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
34291ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3430dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
34314e2b4712SSatish Balay   PetscFunctionReturn(0);
34324e2b4712SSatish Balay }
34334e2b4712SSatish Balay 
343478bb4007SShri Abhyankar #undef __FUNCT__
34354dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
34364dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
343778bb4007SShri Abhyankar {
343878bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
343978bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
344078bb4007SShri Abhyankar   PetscErrorCode    ierr;
344178bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3442b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3443b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
344478bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
344578bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
344678bb4007SShri Abhyankar   const PetscScalar *b;
344778bb4007SShri Abhyankar 
344878bb4007SShri Abhyankar   PetscFunctionBegin;
34493649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
345078bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
345178bb4007SShri Abhyankar   t  = a->solve_work;
345278bb4007SShri Abhyankar 
345378bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
345478bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
345578bb4007SShri Abhyankar 
345678bb4007SShri Abhyankar   /* forward solve the lower triangular */
345778bb4007SShri Abhyankar   idx    = 5*r[0];
345878bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
345978bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
346078bb4007SShri Abhyankar   for (i=1; i<n; i++) {
346178bb4007SShri Abhyankar     v     = aa + 25*ai[i];
346278bb4007SShri Abhyankar     vi    = aj + ai[i];
346378bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
346478bb4007SShri Abhyankar     idx   = 5*r[i];
346578bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
346678bb4007SShri Abhyankar     s5  = b[4+idx];
346778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
346878bb4007SShri Abhyankar       idx   = 5*vi[m];
346978bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
347078bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
347178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
347278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
347378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
347478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
347578bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
347678bb4007SShri Abhyankar       v += 25;
347778bb4007SShri Abhyankar     }
347878bb4007SShri Abhyankar     idx = 5*i;
347978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
348078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
348178bb4007SShri Abhyankar   }
348278bb4007SShri Abhyankar   /* backward solve the upper triangular */
348378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
348478bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
348578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
348678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
348778bb4007SShri Abhyankar     idt  = 5*i;
348878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
348978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
349078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
349178bb4007SShri Abhyankar       idx   = 5*vi[m];
349278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
349378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
349478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
349578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
349678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
349778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
349878bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
349978bb4007SShri Abhyankar       v += 25;
350078bb4007SShri Abhyankar     }
350178bb4007SShri Abhyankar     idc = 5*c[i];
350278bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
350378bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
350478bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
350578bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
350678bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
350778bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
350878bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
350978bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
351078bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
351178bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
351278bb4007SShri Abhyankar   }
351378bb4007SShri Abhyankar 
351478bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
351578bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
35163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
351778bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
351878bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
351978bb4007SShri Abhyankar   PetscFunctionReturn(0);
352078bb4007SShri Abhyankar }
352178bb4007SShri Abhyankar 
35228f690400SShri Abhyankar #undef __FUNCT__
352306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
352406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
352515091d37SBarry Smith {
352615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3527b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3528b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3529dfbe8321SBarry Smith   PetscErrorCode    ierr;
3530d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3531d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3532d9fead3dSBarry Smith   const PetscScalar *b;
353315091d37SBarry Smith 
353415091d37SBarry Smith   PetscFunctionBegin;
35353649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
35361ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
353715091d37SBarry Smith   /* forward solve the lower triangular */
353815091d37SBarry Smith   idx    = 0;
353915091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
354015091d37SBarry Smith   for (i=1; i<n; i++) {
354115091d37SBarry Smith     v     =  aa + 25*ai[i];
354215091d37SBarry Smith     vi    =  aj + ai[i];
354315091d37SBarry Smith     nz    =  diag[i] - ai[i];
354415091d37SBarry Smith     idx   =  5*i;
3545f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
354615091d37SBarry Smith     while (nz--) {
354715091d37SBarry Smith       jdx   = 5*(*vi++);
354815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3549f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3550f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3551f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3552f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3553f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
355415091d37SBarry Smith       v    += 25;
355515091d37SBarry Smith     }
3556f1af5d2fSBarry Smith     x[idx]   = s1;
3557f1af5d2fSBarry Smith     x[1+idx] = s2;
3558f1af5d2fSBarry Smith     x[2+idx] = s3;
3559f1af5d2fSBarry Smith     x[3+idx] = s4;
3560f1af5d2fSBarry Smith     x[4+idx] = s5;
356115091d37SBarry Smith   }
356215091d37SBarry Smith   /* backward solve the upper triangular */
356315091d37SBarry Smith   for (i=n-1; i>=0; i--){
356415091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
356515091d37SBarry Smith     vi   = aj + diag[i] + 1;
356615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
356715091d37SBarry Smith     idt  = 5*i;
3568f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3569f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
357015091d37SBarry Smith     while (nz--) {
357115091d37SBarry Smith       idx   = 5*(*vi++);
357215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3573f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3574f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3575f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3576f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3577f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
357815091d37SBarry Smith       v    += 25;
357915091d37SBarry Smith     }
358015091d37SBarry Smith     v        = aa + 25*diag[i];
3581f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3582f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3583f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3584f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3585f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
358615091d37SBarry Smith   }
358715091d37SBarry Smith 
35883649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
35891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3590dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
359115091d37SBarry Smith   PetscFunctionReturn(0);
359215091d37SBarry Smith }
359315091d37SBarry Smith 
3594cee9d6f2SShri Abhyankar #undef __FUNCT__
35954dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
35964dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
359753cca76cSShri Abhyankar {
359853cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3599b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3600b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
360153cca76cSShri Abhyankar   PetscErrorCode    ierr;
360253cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
360353cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
360453cca76cSShri Abhyankar   const PetscScalar *b;
360553cca76cSShri Abhyankar 
360653cca76cSShri Abhyankar   PetscFunctionBegin;
36073649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
360853cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
360953cca76cSShri Abhyankar   /* forward solve the lower triangular */
361053cca76cSShri Abhyankar   idx    = 0;
361153cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
361253cca76cSShri Abhyankar   for (i=1; i<n; i++) {
361353cca76cSShri Abhyankar     v   = aa + 25*ai[i];
361453cca76cSShri Abhyankar     vi  = aj + ai[i];
361553cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
361653cca76cSShri Abhyankar     idx = 5*i;
361753cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
361853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
361953cca76cSShri Abhyankar       jdx   = 5*vi[k];
362053cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
362153cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
362253cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
362353cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
362453cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
362553cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
362653cca76cSShri Abhyankar       v    += 25;
362753cca76cSShri Abhyankar     }
362853cca76cSShri Abhyankar     x[idx]   = s1;
362953cca76cSShri Abhyankar     x[1+idx] = s2;
363053cca76cSShri Abhyankar     x[2+idx] = s3;
363153cca76cSShri Abhyankar     x[3+idx] = s4;
363253cca76cSShri Abhyankar     x[4+idx] = s5;
363353cca76cSShri Abhyankar   }
363453cca76cSShri Abhyankar 
363553cca76cSShri Abhyankar   /* backward solve the upper triangular */
363653cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
363753cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
363853cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
363953cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
364053cca76cSShri Abhyankar     idt = 5*i;
364153cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
364253cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364353cca76cSShri Abhyankar     for(k=0;k<nz;k++){
364453cca76cSShri Abhyankar       idx   = 5*vi[k];
364553cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
364653cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
364753cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
364853cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
364953cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
365053cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
365153cca76cSShri Abhyankar       v    += 25;
365253cca76cSShri Abhyankar     }
365353cca76cSShri Abhyankar     /* x = inv_diagonal*x */
365453cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
365553cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
365653cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
365753cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
365853cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
365953cca76cSShri Abhyankar   }
366053cca76cSShri Abhyankar 
36613649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
366253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
366353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
366453cca76cSShri Abhyankar   PetscFunctionReturn(0);
366553cca76cSShri Abhyankar }
366653cca76cSShri Abhyankar 
366753cca76cSShri Abhyankar #undef __FUNCT__
366806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
366906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
36704e2b4712SSatish Balay {
36714e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
36724e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
36736849ba73SBarry Smith   PetscErrorCode    ierr;
3674b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3675b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
36765d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3677d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3678d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3679d9fead3dSBarry Smith   const PetscScalar *b;
36804e2b4712SSatish Balay 
36814e2b4712SSatish Balay   PetscFunctionBegin;
36823649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
36831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3684f1af5d2fSBarry Smith   t  = a->solve_work;
36854e2b4712SSatish Balay 
36864e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
36874e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
36884e2b4712SSatish Balay 
36894e2b4712SSatish Balay   /* forward solve the lower triangular */
36904e2b4712SSatish Balay   idx    = 4*(*r++);
3691f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3692f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
36934e2b4712SSatish Balay   for (i=1; i<n; i++) {
36944e2b4712SSatish Balay     v     = aa + 16*ai[i];
36954e2b4712SSatish Balay     vi    = aj + ai[i];
36964e2b4712SSatish Balay     nz    = diag[i] - ai[i];
36974e2b4712SSatish Balay     idx   = 4*(*r++);
3698f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
36994e2b4712SSatish Balay     while (nz--) {
37004e2b4712SSatish Balay       idx   = 4*(*vi++);
3701f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3702f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3703f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3704f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3705f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
37064e2b4712SSatish Balay       v    += 16;
37074e2b4712SSatish Balay     }
37084e2b4712SSatish Balay     idx        = 4*i;
3709f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3710f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
37114e2b4712SSatish Balay   }
37124e2b4712SSatish Balay   /* backward solve the upper triangular */
37134e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
37144e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
37154e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
37164e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
37174e2b4712SSatish Balay     idt  = 4*i;
3718f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3719f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
37204e2b4712SSatish Balay     while (nz--) {
37214e2b4712SSatish Balay       idx   = 4*(*vi++);
3722f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3723f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3724f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3725f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3726f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3727f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
37284e2b4712SSatish Balay       v += 16;
37294e2b4712SSatish Balay     }
37304e2b4712SSatish Balay     idc      = 4*(*c--);
37314e2b4712SSatish Balay     v        = aa + 16*diag[i];
3732f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3733f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3734f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3735f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
37364e2b4712SSatish Balay   }
37374e2b4712SSatish Balay 
37384e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
37394e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37403649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
37411ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3742dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37434e2b4712SSatish Balay   PetscFunctionReturn(0);
37444e2b4712SSatish Balay }
3745f26ec98cSKris Buschelman 
37468f690400SShri Abhyankar #undef __FUNCT__
37474dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
37484dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
374978bb4007SShri Abhyankar {
375078bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
375178bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
375278bb4007SShri Abhyankar   PetscErrorCode    ierr;
3753b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3754b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
375578bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
375678bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
375778bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
375878bb4007SShri Abhyankar   const PetscScalar *b;
375978bb4007SShri Abhyankar 
376078bb4007SShri Abhyankar   PetscFunctionBegin;
37613649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
376278bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
376378bb4007SShri Abhyankar   t  = a->solve_work;
376478bb4007SShri Abhyankar 
376578bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
376678bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
376778bb4007SShri Abhyankar 
376878bb4007SShri Abhyankar   /* forward solve the lower triangular */
376978bb4007SShri Abhyankar   idx    = 4*r[0];
377078bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
377178bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
377278bb4007SShri Abhyankar   for (i=1; i<n; i++) {
377378bb4007SShri Abhyankar     v     = aa + 16*ai[i];
377478bb4007SShri Abhyankar     vi    = aj + ai[i];
377578bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
377678bb4007SShri Abhyankar     idx   = 4*r[i];
377778bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
377878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
377978bb4007SShri Abhyankar       idx   = 4*vi[m];
378078bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
378178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
378278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
378378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
378478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
378578bb4007SShri Abhyankar       v    += 16;
378678bb4007SShri Abhyankar     }
378778bb4007SShri Abhyankar     idx        = 4*i;
378878bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
378978bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
379078bb4007SShri Abhyankar   }
379178bb4007SShri Abhyankar   /* backward solve the upper triangular */
379278bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
379378bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
379478bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
379578bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
379678bb4007SShri Abhyankar     idt  = 4*i;
379778bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
379878bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
379978bb4007SShri Abhyankar     for(m=0;m<nz;m++){
380078bb4007SShri Abhyankar       idx   = 4*vi[m];
380178bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
380278bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
380378bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
380478bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
380578bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
380678bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
380778bb4007SShri Abhyankar       v += 16;
380878bb4007SShri Abhyankar     }
380978bb4007SShri Abhyankar     idc      = 4*c[i];
381078bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
381178bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
381278bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
381378bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
381478bb4007SShri Abhyankar   }
381578bb4007SShri Abhyankar 
381678bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
381778bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
381978bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382078bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
382178bb4007SShri Abhyankar   PetscFunctionReturn(0);
382278bb4007SShri Abhyankar }
382378bb4007SShri Abhyankar 
382478bb4007SShri Abhyankar #undef __FUNCT__
3825f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3826dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3827f26ec98cSKris Buschelman {
3828f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3829f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
38306849ba73SBarry Smith   PetscErrorCode    ierr;
3831b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3832b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
38335d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3834d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3835d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3836d9fead3dSBarry Smith   PetscScalar       *x;
3837d9fead3dSBarry Smith   const PetscScalar *b;
3838f26ec98cSKris Buschelman 
3839f26ec98cSKris Buschelman   PetscFunctionBegin;
38403649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
38411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3842f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3843f26ec98cSKris Buschelman 
3844f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3845f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3846f26ec98cSKris Buschelman 
3847f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3848f26ec98cSKris Buschelman   idx    = 4*(*r++);
3849f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3850f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3851f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3852f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3853f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3854f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3855f26ec98cSKris Buschelman     vi    = aj + ai[i];
3856f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3857f26ec98cSKris Buschelman     idx   = 4*(*r++);
3858f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3859f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3860f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3861f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3862f26ec98cSKris Buschelman     while (nz--) {
3863f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3864f26ec98cSKris Buschelman       x1  = t[idx];
3865f26ec98cSKris Buschelman       x2  = t[1+idx];
3866f26ec98cSKris Buschelman       x3  = t[2+idx];
3867f26ec98cSKris Buschelman       x4  = t[3+idx];
3868f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3869f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3870f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3871f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3872f26ec98cSKris Buschelman       v    += 16;
3873f26ec98cSKris Buschelman     }
3874f26ec98cSKris Buschelman     idx        = 4*i;
3875f26ec98cSKris Buschelman     t[idx]   = s1;
3876f26ec98cSKris Buschelman     t[1+idx] = s2;
3877f26ec98cSKris Buschelman     t[2+idx] = s3;
3878f26ec98cSKris Buschelman     t[3+idx] = s4;
3879f26ec98cSKris Buschelman   }
3880f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3881f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3882f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3883f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3884f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3885f26ec98cSKris Buschelman     idt  = 4*i;
3886f26ec98cSKris Buschelman     s1 = t[idt];
3887f26ec98cSKris Buschelman     s2 = t[1+idt];
3888f26ec98cSKris Buschelman     s3 = t[2+idt];
3889f26ec98cSKris Buschelman     s4 = t[3+idt];
3890f26ec98cSKris Buschelman     while (nz--) {
3891f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3892f26ec98cSKris Buschelman       x1  = t[idx];
3893f26ec98cSKris Buschelman       x2  = t[1+idx];
3894f26ec98cSKris Buschelman       x3  = t[2+idx];
3895f26ec98cSKris Buschelman       x4  = t[3+idx];
3896f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3897f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3898f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3899f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3900f26ec98cSKris Buschelman       v += 16;
3901f26ec98cSKris Buschelman     }
3902f26ec98cSKris Buschelman     idc      = 4*(*c--);
3903f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3904f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3905f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3906f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3907f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3908f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3909f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3910f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3911f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3912f26ec98cSKris Buschelman  }
3913f26ec98cSKris Buschelman 
3914f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3915f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
39163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
39171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3918dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3919f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3920f26ec98cSKris Buschelman }
3921f26ec98cSKris Buschelman 
392224c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
392324c233c2SKris Buschelman 
392424c233c2SKris Buschelman #include PETSC_HAVE_SSE
392524c233c2SKris Buschelman 
392624c233c2SKris Buschelman #undef __FUNCT__
392724c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3928dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
392924c233c2SKris Buschelman {
393024c233c2SKris Buschelman   /*
393124c233c2SKris Buschelman      Note: This code uses demotion of double
393224c233c2SKris Buschelman      to float when performing the mixed-mode computation.
393324c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
393424c233c2SKris Buschelman   */
393524c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
393624c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
39376849ba73SBarry Smith   PetscErrorCode ierr;
39385d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
39395d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
394024c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
394187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
394224c233c2SKris Buschelman 
394324c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
394424c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
394524c233c2SKris Buschelman   unsigned long   offset;
394624c233c2SKris Buschelman 
394724c233c2SKris Buschelman   PetscFunctionBegin;
394824c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
394924c233c2SKris Buschelman 
395024c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
395124c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
395224c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
395324c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
395424c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
395524c233c2SKris Buschelman 
39561ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39571ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
395824c233c2SKris Buschelman     t  = a->solve_work;
395924c233c2SKris Buschelman 
396024c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
396124c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
396224c233c2SKris Buschelman 
396324c233c2SKris Buschelman     /* forward solve the lower triangular */
396424c233c2SKris Buschelman     idx  = 4*(*r++);
396524c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
396624c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
396724c233c2SKris Buschelman     v    =  aa + 16*ai[1];
396824c233c2SKris Buschelman 
396924c233c2SKris Buschelman     for (i=1; i<n;) {
397024c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
397124c233c2SKris Buschelman       vi   =  aj      + ai[i];
397224c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
397324c233c2SKris Buschelman       idx  =  4*(*r++);
397424c233c2SKris Buschelman 
397524c233c2SKris Buschelman       /* Demote sum from double to float */
397624c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
397724c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
397824c233c2SKris Buschelman 
397924c233c2SKris Buschelman       while (nz--) {
398024c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
398124c233c2SKris Buschelman         idx = 4*(*vi++);
398224c233c2SKris Buschelman 
398324c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
398424c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
398524c233c2SKris Buschelman 
398624c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
398724c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
398824c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
398924c233c2SKris Buschelman 
399024c233c2SKris Buschelman           /* First Column */
399124c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
399224c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
399324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
399424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
399524c233c2SKris Buschelman 
399624c233c2SKris Buschelman           /* Second Column */
399724c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
399824c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
399924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
400024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
400124c233c2SKris Buschelman 
400224c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
400324c233c2SKris Buschelman 
400424c233c2SKris Buschelman           /* Third Column */
400524c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
400624c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
400724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
400824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
400924c233c2SKris Buschelman 
401024c233c2SKris Buschelman           /* Fourth Column */
401124c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
401224c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
401324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
401424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
401524c233c2SKris Buschelman         SSE_INLINE_END_2
401624c233c2SKris Buschelman 
401724c233c2SKris Buschelman         v  += 16;
401824c233c2SKris Buschelman       }
401924c233c2SKris Buschelman       idx = 4*i;
402024c233c2SKris Buschelman       v   = aa + 16*ai[++i];
402124c233c2SKris Buschelman       PREFETCH_NTA(v);
402224c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
402324c233c2SKris Buschelman 
402424c233c2SKris Buschelman       /* Promote result from float to double */
402524c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
402624c233c2SKris Buschelman     }
402724c233c2SKris Buschelman     /* backward solve the upper triangular */
402824c233c2SKris Buschelman     idt  = 4*(n-1);
402924c233c2SKris Buschelman     ai16 = 16*diag[n-1];
403024c233c2SKris Buschelman     v    = aa + ai16 + 16;
403124c233c2SKris Buschelman     for (i=n-1; i>=0;){
403224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
403324c233c2SKris Buschelman       vi = aj + diag[i] + 1;
403424c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
403524c233c2SKris Buschelman 
403624c233c2SKris Buschelman       /* Demote accumulator from double to float */
403724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
403824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
403924c233c2SKris Buschelman 
404024c233c2SKris Buschelman       while (nz--) {
404124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
404224c233c2SKris Buschelman         idx = 4*(*vi++);
404324c233c2SKris Buschelman 
404424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
404524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
404624c233c2SKris Buschelman 
404724c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
404824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
404924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
405024c233c2SKris Buschelman 
405124c233c2SKris Buschelman           /* First Column */
405224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
405324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
405424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
405524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
405624c233c2SKris Buschelman 
405724c233c2SKris Buschelman           /* Second Column */
405824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
405924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
406024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
406124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
406224c233c2SKris Buschelman 
406324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
406424c233c2SKris Buschelman 
406524c233c2SKris Buschelman           /* Third Column */
406624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
406724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
406824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
406924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
407024c233c2SKris Buschelman 
407124c233c2SKris Buschelman           /* Fourth Column */
407224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
407324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
407424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
407524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
407624c233c2SKris Buschelman         SSE_INLINE_END_2
407724c233c2SKris Buschelman         v  += 16;
407824c233c2SKris Buschelman       }
407924c233c2SKris Buschelman       v    = aa + ai16;
408024c233c2SKris Buschelman       ai16 = 16*diag[--i];
408124c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
408224c233c2SKris Buschelman       /*
408324c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
408424c233c2SKris Buschelman          which was inverted as part of the factorization
408524c233c2SKris Buschelman       */
408624c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
408724c233c2SKris Buschelman         /* First Column */
408824c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
408924c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
409024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
409124c233c2SKris Buschelman 
409224c233c2SKris Buschelman         /* Second Column */
409324c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
409424c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
409524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
409624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
409724c233c2SKris Buschelman 
409824c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
409924c233c2SKris Buschelman 
410024c233c2SKris Buschelman         /* Third Column */
410124c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
410224c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
410324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
410424c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
410524c233c2SKris Buschelman 
410624c233c2SKris Buschelman         /* Fourth Column */
410724c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
410824c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
410924c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
411024c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
411124c233c2SKris Buschelman 
411224c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
411324c233c2SKris Buschelman       SSE_INLINE_END_3
411424c233c2SKris Buschelman 
411524c233c2SKris Buschelman       /* Promote solution from float to double */
411624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
411724c233c2SKris Buschelman 
411824c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
411924c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
412024c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
412124c233c2SKris Buschelman       idc  = 4*(*c--);
412224c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
412324c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
412424c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
412524c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
412624c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
412724c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
412824c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
412924c233c2SKris Buschelman       SSE_INLINE_END_2
413024c233c2SKris Buschelman       v    = aa + ai16 + 16;
413124c233c2SKris Buschelman       idt -= 4;
413224c233c2SKris Buschelman     }
413324c233c2SKris Buschelman 
413424c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
413524c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
41361ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
41371ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4138dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
413924c233c2SKris Buschelman   SSE_SCOPE_END;
414024c233c2SKris Buschelman   PetscFunctionReturn(0);
414124c233c2SKris Buschelman }
414224c233c2SKris Buschelman 
414324c233c2SKris Buschelman #endif
41440ef38995SBarry Smith 
41450ef38995SBarry Smith 
41464e2b4712SSatish Balay /*
41474e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
41484e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
41494e2b4712SSatish Balay */
41504a2ae208SSatish Balay #undef __FUNCT__
415106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Triangular solves with the
   factored SeqBAIJ matrix A using 4x4 blocks: reads the right-hand side
   from bb and writes the solution into xx.

   Input:
     A  - matrix whose data (a->i, a->j, a->diag, a->a, a->mbs) is used
     bb - right-hand side, accessed read-only
   Output:
     xx - solution; also used as the work array between the two sweeps

   Both vectors are indexed directly by block row (4*i); no index sets are
   applied. Each 4x4 block is addressed as v[row + 4*col].

   If one of the PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ* macros is defined the
   whole solve is delegated to the matching Fortran routine; otherwise the
   C code in the final branch runs.

   NOTE(review): the C branch copies b[0..3] into x[0..3] unconditionally,
   so it assumes n >= 1 - confirm that an empty matrix never reaches here.
*/
415206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
41534e2b4712SSatish Balay {
41544e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4155356650c2SBarry Smith   PetscInt          n=a->mbs;
4156356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4157dfbe8321SBarry Smith   PetscErrorCode    ierr;
4158356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4159d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
4160d9fead3dSBarry Smith   PetscScalar       *x;
4161d9fead3dSBarry Smith   const PetscScalar *b;
41624e2b4712SSatish Balay 
41634e2b4712SSatish Balay   PetscFunctionBegin;
41643649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
41651ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41664e2b4712SSatish Balay 
  /* Optional Fortran kernels: each one performs the entire solve in a single call */
4167aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
41682853dc0eSBarry Smith   {
416987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix: fixed 2000-entry static scratch array, not sized from n */
41702853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
41712853dc0eSBarry Smith   }
4172aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
41732853dc0eSBarry Smith   {
417487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix: fixed 2000-entry static scratch array, not sized from n */
41752853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
41762853dc0eSBarry Smith   }
4177aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
41782853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4179e1293385SBarry Smith #else
418030d4dcafSBarry Smith   {
418187828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4182d9fead3dSBarry Smith     const MatScalar *v;
4183356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4184356650c2SBarry Smith     const PetscInt  *vi;
4185e1293385SBarry Smith 
41864e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0 is copied straight from b; rows 1..n-1 are updated below */
41874e2b4712SSatish Balay   idx    = 0;
4188e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
41894e2b4712SSatish Balay   for (i=1; i<n; i++) {
41904e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
41914e2b4712SSatish Balay     vi    =  aj      + ai[i];
    /* number of blocks of row i stored before position diag[i] */
41924e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
4193e1293385SBarry Smith     idx   +=  4;
4194f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
41954e2b4712SSatish Balay     while (nz--) {
      /* s -= (4x4 block) * (x block at block column *vi) */
41964e2b4712SSatish Balay       jdx   = 4*(*vi++);
41974e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4198f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4199f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4200f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4201f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
42024e2b4712SSatish Balay       v    += 16;
42034e2b4712SSatish Balay     }
4204f1af5d2fSBarry Smith     x[idx]   = s1;
4205f1af5d2fSBarry Smith     x[1+idx] = s2;
4206f1af5d2fSBarry Smith     x[2+idx] = s3;
4207f1af5d2fSBarry Smith     x[3+idx] = s4;
42084e2b4712SSatish Balay   }
42094e2b4712SSatish Balay   /* backward solve the upper triangular */
42104e555682SBarry Smith   idt = 4*(n-1);
42114e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* ai16 is the scalar offset of the block at position diag[i]; v starts one block past it */
42124e555682SBarry Smith     ai16 = 16*diag[i];
42134e555682SBarry Smith     v    = aa + ai16 + 16;
42144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
    /* number of blocks of row i stored after position diag[i] */
42154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4216f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4217f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
42184e2b4712SSatish Balay     while (nz--) {
42194e2b4712SSatish Balay       idx   = 4*(*vi++);
42204e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4221f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4222f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4223f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4224f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
42254e2b4712SSatish Balay       v    += 16;
42264e2b4712SSatish Balay     }
    /* multiply the accumulated block by the 4x4 block at position diag[i] */
    /* NOTE(review): presumably that block already holds the inverted diagonal block - confirm against the factorization routine */
42274e555682SBarry Smith     v        = aa + ai16;
4228f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4229f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4230f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4231f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4232329f5518SBarry Smith     idt -= 4;
42334e2b4712SSatish Balay   }
423430d4dcafSBarry Smith   }
4235e1293385SBarry Smith #endif
42364e2b4712SSatish Balay 
42373649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
42381ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count logged from the stored block count a->nz and the column count of A */
4239dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
42404e2b4712SSatish Balay   PetscFunctionReturn(0);
42414e2b4712SSatish Balay }
42424e2b4712SSatish Balay 
4243b2b2dd24SShri Abhyankar #undef __FUNCT__
42444dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering - Triangular solves with the factored
   SeqBAIJ matrix A: reads the right-hand side from bb and writes the
   solution into xx, indexing both directly by block row (bs*i).

   Storage as used by this routine:
     - forward sweep: row i uses all ai[i+1]-ai[i] blocks starting at ai[i]
     - backward sweep: row i uses the adiag[i]-adiag[i+1]-1 blocks starting
       at adiag[i+1]+1; the block reached after them (position adiag[i])
       multiplies the accumulated result
   Each block is addressed as v[row + 4*col].

   NOTE(review): bs and bs2 are read from A->rmap->bs and a->bs2, but the
   arithmetic is unrolled for four components and block entries v[0..15];
   this assumes bs == 4 and bs2 == 16 - confirm at the call site.
   NOTE(review): x[0..3] is written before any loop runs, so n >= 1 is
   assumed - confirm that an empty matrix never reaches here.
*/
42454dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4246b2b2dd24SShri Abhyankar {
4247b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4248b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4249b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4250b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4251b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4252b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4253b2b2dd24SShri Abhyankar     PetscScalar       *x;
4254b2b2dd24SShri Abhyankar     const PetscScalar *b;
4255b2b2dd24SShri Abhyankar     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4256cee9d6f2SShri Abhyankar 
4257b2b2dd24SShri Abhyankar     PetscFunctionBegin;
42583649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4259b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4260b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 is copied straight from b; rows 1..n-1 are updated below */
4261b2b2dd24SShri Abhyankar     idx    = 0;
4262b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4263b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4264b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4265b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
       /* every block stored for row i in the ai/aj structure takes part in the forward sweep */
4266b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4267b2b2dd24SShri Abhyankar       idx   = bs*i;
4268b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4269b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
          /* s -= (4x4 block k of row i) * (x block at block column vi[k]) */
4270b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
4271b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4272b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4273b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4274b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4275b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4276b2b2dd24SShri Abhyankar 
4277b2b2dd24SShri Abhyankar           v   +=  bs2;
4278b2b2dd24SShri Abhyankar         }
4279b2b2dd24SShri Abhyankar 
4280b2b2dd24SShri Abhyankar        x[idx]   = s1;
4281b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4282b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4283b2b2dd24SShri Abhyankar        x[3+idx] = s4;
4284b2b2dd24SShri Abhyankar     }
4285b2b2dd24SShri Abhyankar 
4286b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4287b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* row i of this sweep occupies positions adiag[i+1]+1 .. adiag[i]; the last one is applied after the loop */
4288b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4289b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4290b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4291b2b2dd24SShri Abhyankar      idt = bs*i;
4292b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4293b2b2dd24SShri Abhyankar 
4294b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
4295b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
4296b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4297b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4298b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4299b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4300b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4301b2b2dd24SShri Abhyankar 
4302b2b2dd24SShri Abhyankar         v   +=  bs2;
4303b2b2dd24SShri Abhyankar     }
    /* after nz steps v points at the block stored at position adiag[i] */
4304b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4305b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4306b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; /* second ';' is an empty statement */
4307b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4308b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4309b2b2dd24SShri Abhyankar 
4310b2b2dd24SShri Abhyankar   }
4311b2b2dd24SShri Abhyankar 
43123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4313b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count logged from the stored block count a->nz and the column count of A */
4314b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4315b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4316b2b2dd24SShri Abhyankar }
4317cee9d6f2SShri Abhyankar 
4318cee9d6f2SShri Abhyankar #undef __FUNCT__
4319f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Triangular solves with the
   factored SeqBAIJ matrix A using 4x4 blocks, carrying out the arithmetic
   in MatScalar. Reads the right-hand side from bb and writes the solution
   into xx; both are indexed directly by block row (4*i).

   The storage of x doubles as the MatScalar work array t (t is x cast to
   MatScalar*):
     - forward sweep: reads b, stores intermediate results through t
     - backward sweep: loads the row accumulator from t, reads rows that
       are already finished from x, and stores the final row into x

   NOTE(review): the backward sweep writes x[idt..idt+3] while lower rows
   are still held in t; this appears to rely on sizeof(MatScalar) being no
   larger than sizeof(PetscScalar) so those writes never overlap t entries
   that are still to be read - confirm.
   NOTE(review): t[0..3] is written before any loop runs, so n >= 1 is
   assumed - confirm that an empty matrix never reaches here.
*/
4320dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4321f26ec98cSKris Buschelman {
4322f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4323b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4324dfbe8321SBarry Smith   PetscErrorCode    ierr;
4325b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4326b3260449SShri Abhyankar   const PetscScalar *b;
4327b3260449SShri Abhyankar   PetscScalar       *x;
4328f26ec98cSKris Buschelman 
4329f26ec98cSKris Buschelman   PetscFunctionBegin;
43303649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
43311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4332f26ec98cSKris Buschelman 
4333f26ec98cSKris Buschelman   {
4334f26ec98cSKris Buschelman     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4335b3260449SShri Abhyankar     const MatScalar  *v;
    /* t views the storage of x as MatScalar entries */
4336b3260449SShri Abhyankar     MatScalar        *t=(MatScalar *)x;
4337b3260449SShri Abhyankar     PetscInt         jdx,idt,idx,nz,i,ai16;
4338b3260449SShri Abhyankar     const PetscInt   *vi;
4339f26ec98cSKris Buschelman 
4340f26ec98cSKris Buschelman     /* forward solve the lower triangular */
    /* block row 0 is b converted to MatScalar; rows 1..n-1 are updated below */
4341f26ec98cSKris Buschelman     idx  = 0;
4342f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4343f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4344f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4345f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4346f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4347f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
4348f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
      /* number of blocks of row i stored before position diag[i] */
4349f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
4350f26ec98cSKris Buschelman       idx   +=  4;
4351f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
4352f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
4353f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
4354f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
4355f26ec98cSKris Buschelman       while (nz--) {
        /* earlier rows are read back through t, where the forward sweep stored them */
4356f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4357f26ec98cSKris Buschelman         x1  = t[jdx];
4358f26ec98cSKris Buschelman         x2  = t[1+jdx];
4359f26ec98cSKris Buschelman         x3  = t[2+jdx];
4360f26ec98cSKris Buschelman         x4  = t[3+jdx];
4361f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4362f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4363f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4364f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4365f26ec98cSKris Buschelman         v    += 16;
4366f26ec98cSKris Buschelman       }
4367f26ec98cSKris Buschelman       t[idx]   = s1;
4368f26ec98cSKris Buschelman       t[1+idx] = s2;
4369f26ec98cSKris Buschelman       t[2+idx] = s3;
4370f26ec98cSKris Buschelman       t[3+idx] = s4;
4371f26ec98cSKris Buschelman     }
4372f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4373f26ec98cSKris Buschelman     idt = 4*(n-1);
4374f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
      /* ai16 is the scalar offset of the block at position diag[i]; v starts one block past it */
4375f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4376f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4377f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
      /* number of blocks of row i stored after position diag[i] */
4378f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
      /* the accumulator for row i still lives in t; it is loaded before x[idt..] is written below */
4379f26ec98cSKris Buschelman       s1   = t[idt];
4380f26ec98cSKris Buschelman       s2   = t[1+idt];
4381f26ec98cSKris Buschelman       s3   = t[2+idt];
4382f26ec98cSKris Buschelman       s4   = t[3+idt];
4383f26ec98cSKris Buschelman       while (nz--) {
        /* rows referenced here are read from x as PetscScalar and converted to MatScalar */
4384f26ec98cSKris Buschelman         idx = 4*(*vi++);
4385f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4386f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4387f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4388f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4389f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4390f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4391f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4392f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4393f26ec98cSKris Buschelman         v    += 16;
4394f26ec98cSKris Buschelman       }
      /* multiply by the 4x4 block at position diag[i] and store the row into x as PetscScalar */
      /* NOTE(review): presumably that block already holds the inverted diagonal block - confirm against the factorization routine */
4395f26ec98cSKris Buschelman       v        = aa + ai16;
4396f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4397f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4398f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4399f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4400f26ec98cSKris Buschelman       idt -= 4;
4401f26ec98cSKris Buschelman     }
4402f26ec98cSKris Buschelman   }
4403f26ec98cSKris Buschelman 
44043649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
44051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count logged from the stored block count a->nz and the column count of A */
4406dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4407f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4408f26ec98cSKris Buschelman }
4409f26ec98cSKris Buschelman 
44103660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
44113660e330SKris Buschelman 
44123660e330SKris Buschelman #include PETSC_HAVE_SSE
44133660e330SKris Buschelman #undef __FUNCT__
44147cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4415dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
44163660e330SKris Buschelman {
44173660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
44182aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4419dfbe8321SBarry Smith   PetscErrorCode ierr;
4420dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
44213660e330SKris Buschelman   MatScalar      *aa=a->a;
442287828ca2SBarry Smith   PetscScalar    *x,*b;
44233660e330SKris Buschelman 
44243660e330SKris Buschelman   PetscFunctionBegin;
44253660e330SKris Buschelman   SSE_SCOPE_BEGIN;
44263660e330SKris Buschelman   /*
44273660e330SKris Buschelman      Note: This code currently uses demotion of double
44283660e330SKris Buschelman      to float when performing the mixed-mode computation.
44293660e330SKris Buschelman      This may not be numerically reasonable for all applications.
44303660e330SKris Buschelman   */
44313660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
44323660e330SKris Buschelman 
44331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
44341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
44353660e330SKris Buschelman   {
4436eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4437eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
44382aa5897fSKris Buschelman     int            nz,i,idt,ai16;
44392aa5897fSKris Buschelman     unsigned int   jdx,idx;
44402aa5897fSKris Buschelman     unsigned short *vi;
4441eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
44423660e330SKris Buschelman 
4443eb05f457SKris Buschelman     /* First block is the identity. */
44443660e330SKris Buschelman     idx  = 0;
4445eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
44462aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
44473660e330SKris Buschelman 
44483660e330SKris Buschelman     for (i=1; i<n;) {
44493660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44503660e330SKris Buschelman       vi   =  aj      + ai[i];
44513660e330SKris Buschelman       nz   =  diag[i] - ai[i];
44523660e330SKris Buschelman       idx +=  4;
44533660e330SKris Buschelman 
4454eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4455eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4456eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
44573660e330SKris Buschelman 
44583660e330SKris Buschelman       while (nz--) {
44593660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44602aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
44613660e330SKris Buschelman 
44623660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4463eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
44643660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44653660e330SKris Buschelman 
44663660e330SKris Buschelman           /* First Column */
44673660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44683660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44693660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44703660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44713660e330SKris Buschelman 
44723660e330SKris Buschelman           /* Second Column */
44733660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44743660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44753660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44763660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44773660e330SKris Buschelman 
44783660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44793660e330SKris Buschelman 
44803660e330SKris Buschelman           /* Third Column */
44813660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44823660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44833660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44843660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44853660e330SKris Buschelman 
44863660e330SKris Buschelman           /* Fourth Column */
44873660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
44883660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
44893660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44903660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
44913660e330SKris Buschelman         SSE_INLINE_END_2
44923660e330SKris Buschelman 
44933660e330SKris Buschelman         v  += 16;
44943660e330SKris Buschelman       }
44953660e330SKris Buschelman       v    =  aa + 16*ai[++i];
44963660e330SKris Buschelman       PREFETCH_NTA(v);
4497eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
44983660e330SKris Buschelman     }
4499eb05f457SKris Buschelman 
4500eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4501eb05f457SKris Buschelman 
45023660e330SKris Buschelman     idt  = 4*(n-1);
45033660e330SKris Buschelman     ai16 = 16*diag[n-1];
45043660e330SKris Buschelman     v    = aa + ai16 + 16;
45053660e330SKris Buschelman     for (i=n-1; i>=0;){
45063660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
45073660e330SKris Buschelman       vi = aj + diag[i] + 1;
45083660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
45093660e330SKris Buschelman 
4510eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
45113660e330SKris Buschelman 
45123660e330SKris Buschelman       while (nz--) {
45133660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
45142aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
45153660e330SKris Buschelman 
45163660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4517eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
45183660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
45193660e330SKris Buschelman 
45203660e330SKris Buschelman           /* First Column */
45213660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
45223660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
45233660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
45243660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
45253660e330SKris Buschelman 
45263660e330SKris Buschelman           /* Second Column */
45273660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
45283660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
45293660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
45303660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
45313660e330SKris Buschelman 
45323660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
45333660e330SKris Buschelman 
45343660e330SKris Buschelman           /* Third Column */
45353660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
45363660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
45373660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
45383660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
45393660e330SKris Buschelman 
45403660e330SKris Buschelman           /* Fourth Column */
45413660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
45423660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
45433660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45443660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
45453660e330SKris Buschelman         SSE_INLINE_END_2
45463660e330SKris Buschelman         v  += 16;
45473660e330SKris Buschelman       }
45483660e330SKris Buschelman       v    = aa + ai16;
45493660e330SKris Buschelman       ai16 = 16*diag[--i];
45503660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
45513660e330SKris Buschelman       /*
45523660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
45533660e330SKris Buschelman          which was inverted as part of the factorization
45543660e330SKris Buschelman       */
4555eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
45563660e330SKris Buschelman         /* First Column */
45573660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
45583660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
45593660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
45603660e330SKris Buschelman 
45613660e330SKris Buschelman         /* Second Column */
45623660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
45633660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
45643660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
45653660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
45663660e330SKris Buschelman 
45673660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
45683660e330SKris Buschelman 
45693660e330SKris Buschelman         /* Third Column */
45703660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
45713660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
45723660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
45733660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
45743660e330SKris Buschelman 
45753660e330SKris Buschelman         /* Fourth Column */
45763660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
45773660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
45783660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
45793660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
45803660e330SKris Buschelman 
45813660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
45823660e330SKris Buschelman       SSE_INLINE_END_3
45833660e330SKris Buschelman 
45843660e330SKris Buschelman       v    = aa + ai16 + 16;
45853660e330SKris Buschelman       idt -= 4;
45863660e330SKris Buschelman     }
4587eb05f457SKris Buschelman 
4588eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4589eb05f457SKris Buschelman     idt = 4*(n-1);
4590eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4591eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4592eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4593eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4594eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4595eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4596eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4597eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4598eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
459954693613SKris Buschelman       idt -= 4;
46003660e330SKris Buschelman     }
4601eb05f457SKris Buschelman 
4602eb05f457SKris Buschelman   } /* End of artificial scope. */
46031ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
46041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4605dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
46063660e330SKris Buschelman   SSE_SCOPE_END;
46073660e330SKris Buschelman   PetscFunctionReturn(0);
46083660e330SKris Buschelman }
46093660e330SKris Buschelman 
46107cf1b8d3SKris Buschelman #undef __FUNCT__
46117cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4612dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
46137cf1b8d3SKris Buschelman {
46147cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
46157cf1b8d3SKris Buschelman   int            *aj=a->j;
4616dfbe8321SBarry Smith   PetscErrorCode ierr;
4617dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
46187cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
46197cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
46207cf1b8d3SKris Buschelman 
46217cf1b8d3SKris Buschelman   PetscFunctionBegin;
46227cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
46237cf1b8d3SKris Buschelman   /*
46247cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
46257cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
46267cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
46277cf1b8d3SKris Buschelman   */
46287cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
46297cf1b8d3SKris Buschelman 
46301ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
46311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
46327cf1b8d3SKris Buschelman   {
46337cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
46347cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
46357cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
46367cf1b8d3SKris Buschelman     int       jdx,idx;
46377cf1b8d3SKris Buschelman     int       *vi;
46387cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
46397cf1b8d3SKris Buschelman 
46407cf1b8d3SKris Buschelman     /* First block is the identity. */
46417cf1b8d3SKris Buschelman     idx  = 0;
46427cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
46437cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
46447cf1b8d3SKris Buschelman 
46457cf1b8d3SKris Buschelman     for (i=1; i<n;) {
46467cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46477cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
46487cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
46497cf1b8d3SKris Buschelman       idx +=  4;
46507cf1b8d3SKris Buschelman 
46517cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
46527cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
46537cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
46547cf1b8d3SKris Buschelman 
46557cf1b8d3SKris Buschelman       while (nz--) {
46567cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46577cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
46587cf1b8d3SKris Buschelman /*          jdx = *vi++; */
46597cf1b8d3SKris Buschelman 
46607cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
46617cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
46627cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46637cf1b8d3SKris Buschelman 
46647cf1b8d3SKris Buschelman           /* First Column */
46657cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46667cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46677cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46687cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46697cf1b8d3SKris Buschelman 
46707cf1b8d3SKris Buschelman           /* Second Column */
46717cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46727cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46737cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46747cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46757cf1b8d3SKris Buschelman 
46767cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46777cf1b8d3SKris Buschelman 
46787cf1b8d3SKris Buschelman           /* Third Column */
46797cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46807cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46817cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46827cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46837cf1b8d3SKris Buschelman 
46847cf1b8d3SKris Buschelman           /* Fourth Column */
46857cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
46867cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
46877cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46887cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
46897cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46907cf1b8d3SKris Buschelman 
46917cf1b8d3SKris Buschelman         v  += 16;
46927cf1b8d3SKris Buschelman       }
46937cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
46947cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
46957cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
46967cf1b8d3SKris Buschelman     }
46977cf1b8d3SKris Buschelman 
46987cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
46997cf1b8d3SKris Buschelman 
47007cf1b8d3SKris Buschelman     idt  = 4*(n-1);
47017cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
47027cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
47037cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
47047cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
47057cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
47067cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
47077cf1b8d3SKris Buschelman 
47087cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
47097cf1b8d3SKris Buschelman 
47107cf1b8d3SKris Buschelman       while (nz--) {
47117cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
47127cf1b8d3SKris Buschelman         idx = 4*(*vi++);
47137cf1b8d3SKris Buschelman /*          idx = *vi++; */
47147cf1b8d3SKris Buschelman 
47157cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
47167cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
47177cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
47187cf1b8d3SKris Buschelman 
47197cf1b8d3SKris Buschelman           /* First Column */
47207cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
47217cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
47227cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
47237cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
47247cf1b8d3SKris Buschelman 
47257cf1b8d3SKris Buschelman           /* Second Column */
47267cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
47277cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
47287cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
47297cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
47307cf1b8d3SKris Buschelman 
47317cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
47327cf1b8d3SKris Buschelman 
47337cf1b8d3SKris Buschelman           /* Third Column */
47347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
47357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
47367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
47377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
47387cf1b8d3SKris Buschelman 
47397cf1b8d3SKris Buschelman           /* Fourth Column */
47407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
47417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
47427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
47437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
47447cf1b8d3SKris Buschelman         SSE_INLINE_END_2
47457cf1b8d3SKris Buschelman         v  += 16;
47467cf1b8d3SKris Buschelman       }
47477cf1b8d3SKris Buschelman       v    = aa + ai16;
47487cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
47497cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
47507cf1b8d3SKris Buschelman       /*
47517cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
47527cf1b8d3SKris Buschelman          which was inverted as part of the factorization
47537cf1b8d3SKris Buschelman       */
47547cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
47557cf1b8d3SKris Buschelman         /* First Column */
47567cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
47577cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
47587cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
47597cf1b8d3SKris Buschelman 
47607cf1b8d3SKris Buschelman         /* Second Column */
47617cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
47627cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
47637cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
47647cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
47657cf1b8d3SKris Buschelman 
47667cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
47677cf1b8d3SKris Buschelman 
47687cf1b8d3SKris Buschelman         /* Third Column */
47697cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
47707cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
47717cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
47727cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
47737cf1b8d3SKris Buschelman 
47747cf1b8d3SKris Buschelman         /* Fourth Column */
47757cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
47767cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
47777cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
47787cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
47797cf1b8d3SKris Buschelman 
47807cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
47817cf1b8d3SKris Buschelman       SSE_INLINE_END_3
47827cf1b8d3SKris Buschelman 
47837cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
47847cf1b8d3SKris Buschelman       idt -= 4;
47857cf1b8d3SKris Buschelman     }
47867cf1b8d3SKris Buschelman 
47877cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
47887cf1b8d3SKris Buschelman     idt = 4*(n-1);
47897cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
47907cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
47917cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
47927cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
47937cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
47947cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
47957cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
47967cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
47977cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
47987cf1b8d3SKris Buschelman       idt -= 4;
47997cf1b8d3SKris Buschelman     }
48007cf1b8d3SKris Buschelman 
48017cf1b8d3SKris Buschelman   } /* End of artificial scope. */
48021ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
48031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4804dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
48057cf1b8d3SKris Buschelman   SSE_SCOPE_END;
48067cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
48077cf1b8d3SKris Buschelman }
48087cf1b8d3SKris Buschelman 
48093660e330SKris Buschelman #endif
48108f690400SShri Abhyankar 
48114a2ae208SSatish Balay #undef __FUNCT__
481206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
481306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
48144e2b4712SSatish Balay {
48154e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48164e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
48176849ba73SBarry Smith   PetscErrorCode    ierr;
4818b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4819b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
48205d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4821d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4822d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4823d9fead3dSBarry Smith   const PetscScalar *b;
48244e2b4712SSatish Balay 
48254e2b4712SSatish Balay   PetscFunctionBegin;
48263649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4828f1af5d2fSBarry Smith   t  = a->solve_work;
48294e2b4712SSatish Balay 
48304e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48314e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
48324e2b4712SSatish Balay 
48334e2b4712SSatish Balay   /* forward solve the lower triangular */
48344e2b4712SSatish Balay   idx    = 3*(*r++);
4835f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
48364e2b4712SSatish Balay   for (i=1; i<n; i++) {
48374e2b4712SSatish Balay     v     = aa + 9*ai[i];
48384e2b4712SSatish Balay     vi    = aj + ai[i];
48394e2b4712SSatish Balay     nz    = diag[i] - ai[i];
48404e2b4712SSatish Balay     idx   = 3*(*r++);
4841f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48424e2b4712SSatish Balay     while (nz--) {
48434e2b4712SSatish Balay       idx   = 3*(*vi++);
4844f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4845f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4846f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4847f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48484e2b4712SSatish Balay       v += 9;
48494e2b4712SSatish Balay     }
48504e2b4712SSatish Balay     idx = 3*i;
4851f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48524e2b4712SSatish Balay   }
48534e2b4712SSatish Balay   /* backward solve the upper triangular */
48544e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
48554e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
48564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
48574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
48584e2b4712SSatish Balay     idt  = 3*i;
4859f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48604e2b4712SSatish Balay     while (nz--) {
48614e2b4712SSatish Balay       idx   = 3*(*vi++);
4862f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4863f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4864f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4865f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48664e2b4712SSatish Balay       v += 9;
48674e2b4712SSatish Balay     }
48684e2b4712SSatish Balay     idc = 3*(*c--);
48694e2b4712SSatish Balay     v   = aa + 9*diag[i];
4870f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4871f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4872f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48734e2b4712SSatish Balay   }
48744e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48754e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
48771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4878dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48794e2b4712SSatish Balay   PetscFunctionReturn(0);
48804e2b4712SSatish Balay }
48814e2b4712SSatish Balay 
48820c4413a7SShri Abhyankar #undef __FUNCT__
48834dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
48844dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
48850c4413a7SShri Abhyankar {
48860c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48870c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48880c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4889b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4890b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
48910c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48920c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48930c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
48940c4413a7SShri Abhyankar   const PetscScalar *b;
48950c4413a7SShri Abhyankar 
48960c4413a7SShri Abhyankar   PetscFunctionBegin;
48973649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48980c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48990c4413a7SShri Abhyankar   t  = a->solve_work;
49000c4413a7SShri Abhyankar 
49010c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
49020c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
49030c4413a7SShri Abhyankar 
49040c4413a7SShri Abhyankar   /* forward solve the lower triangular */
49050c4413a7SShri Abhyankar   idx    = 3*r[0];
49060c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
49070c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
49080c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
49090c4413a7SShri Abhyankar     vi    = aj + ai[i];
49100c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
49110c4413a7SShri Abhyankar     idx   = 3*r[i];
49120c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
49130c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
49140c4413a7SShri Abhyankar       idx   = 3*vi[m];
49150c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
49160c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
49170c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
49180c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
49190c4413a7SShri Abhyankar       v += 9;
49200c4413a7SShri Abhyankar     }
49210c4413a7SShri Abhyankar     idx = 3*i;
49220c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
49230c4413a7SShri Abhyankar   }
49240c4413a7SShri Abhyankar   /* backward solve the upper triangular */
49250c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
49260c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
49270c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
49280c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
49290c4413a7SShri Abhyankar     idt  = 3*i;
49300c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
49310c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
49320c4413a7SShri Abhyankar       idx   = 3*vi[m];
49330c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
49340c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
49350c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
49360c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
49370c4413a7SShri Abhyankar       v += 9;
49380c4413a7SShri Abhyankar     }
49390c4413a7SShri Abhyankar     idc = 3*c[i];
49400c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
49410c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
49420c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
49430c4413a7SShri Abhyankar   }
49440c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49450c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49463649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49470c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49480c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
49490c4413a7SShri Abhyankar   PetscFunctionReturn(0);
49500c4413a7SShri Abhyankar }
49510c4413a7SShri Abhyankar 
495215091d37SBarry Smith /*
495315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
495415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
495515091d37SBarry Smith */
49564a2ae208SSatish Balay #undef __FUNCT__
495706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
495806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
495915091d37SBarry Smith {
496015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
49610b68f018SBarry Smith   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4962dfbe8321SBarry Smith   PetscErrorCode    ierr;
49630b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4964d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4965d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4966d9fead3dSBarry Smith   const PetscScalar *b;
49670b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
496815091d37SBarry Smith 
496915091d37SBarry Smith   PetscFunctionBegin;
49703649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
49711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497215091d37SBarry Smith 
497315091d37SBarry Smith   /* forward solve the lower triangular */
497415091d37SBarry Smith   idx    = 0;
497515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
497615091d37SBarry Smith   for (i=1; i<n; i++) {
497715091d37SBarry Smith     v     =  aa      + 9*ai[i];
497815091d37SBarry Smith     vi    =  aj      + ai[i];
497915091d37SBarry Smith     nz    =  diag[i] - ai[i];
498015091d37SBarry Smith     idx   +=  3;
4981f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
498215091d37SBarry Smith     while (nz--) {
498315091d37SBarry Smith       jdx   = 3*(*vi++);
498415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4985f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4986f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4987f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
498815091d37SBarry Smith       v    += 9;
498915091d37SBarry Smith     }
4990f1af5d2fSBarry Smith     x[idx]   = s1;
4991f1af5d2fSBarry Smith     x[1+idx] = s2;
4992f1af5d2fSBarry Smith     x[2+idx] = s3;
499315091d37SBarry Smith   }
499415091d37SBarry Smith   /* backward solve the upper triangular */
499515091d37SBarry Smith   for (i=n-1; i>=0; i--){
499615091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
499715091d37SBarry Smith     vi   = aj + diag[i] + 1;
499815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
499915091d37SBarry Smith     idt  = 3*i;
5000f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
5001f1af5d2fSBarry Smith     s3 = x[2+idt];
500215091d37SBarry Smith     while (nz--) {
500315091d37SBarry Smith       idx   = 3*(*vi++);
500415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
5005f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5006f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5007f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
500815091d37SBarry Smith       v    += 9;
500915091d37SBarry Smith     }
501015091d37SBarry Smith     v        = aa +  9*diag[i];
5011f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5012f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5013f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
501415091d37SBarry Smith   }
501515091d37SBarry Smith 
50163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
50171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5018dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
501915091d37SBarry Smith   PetscFunctionReturn(0);
502015091d37SBarry Smith }
502115091d37SBarry Smith 
5022cee9d6f2SShri Abhyankar #undef __FUNCT__
50234dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
50244dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
5025b2b2dd24SShri Abhyankar {
5026b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5027b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5028b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5029b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
5030b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
5031b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5032b2b2dd24SShri Abhyankar     PetscScalar       *x;
5033b2b2dd24SShri Abhyankar     const PetscScalar *b;
5034b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
5035b2b2dd24SShri Abhyankar 
5036b2b2dd24SShri Abhyankar     PetscFunctionBegin;
50373649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5038b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5039b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5040b2b2dd24SShri Abhyankar     idx    = 0;
5041b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5042b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5043b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
5044b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5045b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5046b2b2dd24SShri Abhyankar       idx   = bs*i;
5047b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5048b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5049b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
5050b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5051b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5052b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5053b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5054b2b2dd24SShri Abhyankar 
5055b2b2dd24SShri Abhyankar           v   +=  bs2;
5056b2b2dd24SShri Abhyankar         }
5057b2b2dd24SShri Abhyankar 
5058b2b2dd24SShri Abhyankar        x[idx]   = s1;
5059b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5060b2b2dd24SShri Abhyankar        x[2+idx] = s3;
5061b2b2dd24SShri Abhyankar     }
5062b2b2dd24SShri Abhyankar 
5063b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5064b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5065b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
5066b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5067b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5068b2b2dd24SShri Abhyankar      idt = bs*i;
5069b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5070b2b2dd24SShri Abhyankar 
5071b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5072b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
5073b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5074b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5075b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5076b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5077b2b2dd24SShri Abhyankar 
5078b2b2dd24SShri Abhyankar         v   +=  bs2;
5079b2b2dd24SShri Abhyankar     }
5080b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5081b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5082b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5083b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5084b2b2dd24SShri Abhyankar 
5085b2b2dd24SShri Abhyankar   }
5086b2b2dd24SShri Abhyankar 
50873649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5088b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5089b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5090b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5091b2b2dd24SShri Abhyankar }
5092b2b2dd24SShri Abhyankar 
5093b2b2dd24SShri Abhyankar #undef __FUNCT__
509406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
509506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
50964e2b4712SSatish Balay {
50974e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
50984e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
50996849ba73SBarry Smith   PetscErrorCode    ierr;
5100b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5101b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
51025d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5103d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5104d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
5105d9fead3dSBarry Smith   const PetscScalar *b;
51064e2b4712SSatish Balay 
51074e2b4712SSatish Balay   PetscFunctionBegin;
51083649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51091ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5110f1af5d2fSBarry Smith   t  = a->solve_work;
51114e2b4712SSatish Balay 
51124e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51134e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
51144e2b4712SSatish Balay 
51154e2b4712SSatish Balay   /* forward solve the lower triangular */
51164e2b4712SSatish Balay   idx    = 2*(*r++);
5117f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
51184e2b4712SSatish Balay   for (i=1; i<n; i++) {
51194e2b4712SSatish Balay     v     = aa + 4*ai[i];
51204e2b4712SSatish Balay     vi    = aj + ai[i];
51214e2b4712SSatish Balay     nz    = diag[i] - ai[i];
51224e2b4712SSatish Balay     idx   = 2*(*r++);
5123f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
51244e2b4712SSatish Balay     while (nz--) {
51254e2b4712SSatish Balay       idx   = 2*(*vi++);
5126f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5127f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5128f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51294e2b4712SSatish Balay       v += 4;
51304e2b4712SSatish Balay     }
51314e2b4712SSatish Balay     idx = 2*i;
5132f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
51334e2b4712SSatish Balay   }
51344e2b4712SSatish Balay   /* backward solve the upper triangular */
51354e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
51364e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
51374e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
51384e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
51394e2b4712SSatish Balay     idt  = 2*i;
5140f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
51414e2b4712SSatish Balay     while (nz--) {
51424e2b4712SSatish Balay       idx   = 2*(*vi++);
5143f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5144f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5145f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51464e2b4712SSatish Balay       v += 4;
51474e2b4712SSatish Balay     }
51484e2b4712SSatish Balay     idc = 2*(*c--);
51494e2b4712SSatish Balay     v   = aa + 4*diag[i];
5150f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5151f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51524e2b4712SSatish Balay   }
51534e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51544e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5157dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51584e2b4712SSatish Balay   PetscFunctionReturn(0);
51594e2b4712SSatish Balay }
51604e2b4712SSatish Balay 
51610c4413a7SShri Abhyankar #undef __FUNCT__
51624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
51634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
51640c4413a7SShri Abhyankar {
51650c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
51660c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
51670c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5168b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5169b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
51700c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
51710c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
51720c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
51730c4413a7SShri Abhyankar   const PetscScalar *b;
51740c4413a7SShri Abhyankar 
51750c4413a7SShri Abhyankar   PetscFunctionBegin;
51763649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51770c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
51780c4413a7SShri Abhyankar   t  = a->solve_work;
51790c4413a7SShri Abhyankar 
51800c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51810c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
51820c4413a7SShri Abhyankar 
51830c4413a7SShri Abhyankar   /* forward solve the lower triangular */
51840c4413a7SShri Abhyankar   idx    = 2*r[0];
51850c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
51860c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
51870c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
51880c4413a7SShri Abhyankar     vi    = aj + ai[i];
51890c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
51900c4413a7SShri Abhyankar     idx   = 2*r[i];
51910c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
51920c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
51930c4413a7SShri Abhyankar       jdx   = 2*vi[m];
51940c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
51950c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51960c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51970c4413a7SShri Abhyankar       v += 4;
51980c4413a7SShri Abhyankar     }
51990c4413a7SShri Abhyankar     idx = 2*i;
52000c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
52010c4413a7SShri Abhyankar   }
52020c4413a7SShri Abhyankar   /* backward solve the upper triangular */
52030c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
52040c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
52050c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
52060c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
52070c4413a7SShri Abhyankar     idt  = 2*i;
52080c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
52090c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
52100c4413a7SShri Abhyankar       idx   = 2*vi[m];
52110c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
52120c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
52130c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
52140c4413a7SShri Abhyankar       v += 4;
52150c4413a7SShri Abhyankar     }
52160c4413a7SShri Abhyankar     idc = 2*c[i];
52170c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
52180c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
52190c4413a7SShri Abhyankar   }
52200c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
52210c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
52223649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52230c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52240c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
52250c4413a7SShri Abhyankar   PetscFunctionReturn(0);
52260c4413a7SShri Abhyankar }
52278f690400SShri Abhyankar 
522815091d37SBarry Smith /*
522915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
523015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
523115091d37SBarry Smith */
52324a2ae208SSatish Balay #undef __FUNCT__
523306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
523406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
523515091d37SBarry Smith {
523615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5237b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5238dfbe8321SBarry Smith   PetscErrorCode    ierr;
5239d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5240d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5241d9fead3dSBarry Smith   const PetscScalar *b;
5242b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
524315091d37SBarry Smith 
524415091d37SBarry Smith   PetscFunctionBegin;
52453649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
52461ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524715091d37SBarry Smith 
524815091d37SBarry Smith   /* forward solve the lower triangular */
524915091d37SBarry Smith   idx    = 0;
525015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
525115091d37SBarry Smith   for (i=1; i<n; i++) {
525215091d37SBarry Smith     v     =  aa      + 4*ai[i];
525315091d37SBarry Smith     vi    =  aj      + ai[i];
525415091d37SBarry Smith     nz    =  diag[i] - ai[i];
525515091d37SBarry Smith     idx   +=  2;
5256f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
525715091d37SBarry Smith     while (nz--) {
525815091d37SBarry Smith       jdx   = 2*(*vi++);
525915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
5260f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5261f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
526215091d37SBarry Smith       v    += 4;
526315091d37SBarry Smith     }
5264f1af5d2fSBarry Smith     x[idx]   = s1;
5265f1af5d2fSBarry Smith     x[1+idx] = s2;
526615091d37SBarry Smith   }
526715091d37SBarry Smith   /* backward solve the upper triangular */
526815091d37SBarry Smith   for (i=n-1; i>=0; i--){
526915091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
527015091d37SBarry Smith     vi   = aj + diag[i] + 1;
527115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
527215091d37SBarry Smith     idt  = 2*i;
5273f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
527415091d37SBarry Smith     while (nz--) {
527515091d37SBarry Smith       idx   = 2*(*vi++);
527615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
5277f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5278f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
527915091d37SBarry Smith       v    += 4;
528015091d37SBarry Smith     }
528115091d37SBarry Smith     v        = aa +  4*diag[i];
5282f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5283f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
528415091d37SBarry Smith   }
528515091d37SBarry Smith 
52863649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5288dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
528915091d37SBarry Smith   PetscFunctionReturn(0);
529015091d37SBarry Smith }
529115091d37SBarry Smith 
5292cee9d6f2SShri Abhyankar #undef __FUNCT__
52934dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
52944dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5295b2b2dd24SShri Abhyankar {
5296b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5297b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5298b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,idt,jdx;
5299b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5300b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5301b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
5302b2b2dd24SShri Abhyankar     const PetscScalar *b;
5303b2b2dd24SShri Abhyankar 
5304b2b2dd24SShri Abhyankar     PetscFunctionBegin;
53053649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5306b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5307b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5308b2b2dd24SShri Abhyankar     idx    = 0;
5309b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
5310b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5311b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
5312b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5313b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5314b2b2dd24SShri Abhyankar        idx  = 2*i;
5315b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
5316b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5317b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
5318b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
5319b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
5320b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
5321b2b2dd24SShri Abhyankar            v   +=  4;
5322b2b2dd24SShri Abhyankar         }
5323b2b2dd24SShri Abhyankar        x[idx]   = s1;
5324b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5325b2b2dd24SShri Abhyankar     }
5326b2b2dd24SShri Abhyankar 
5327b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5328b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5329b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
5330b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5331b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5332b2b2dd24SShri Abhyankar      idt = 2*i;
5333b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
5334b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5335b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
5336b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
5337b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
5338b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
5339b2b2dd24SShri Abhyankar          v    += 4;
5340b2b2dd24SShri Abhyankar     }
5341b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5342b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
5343b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
5344b2b2dd24SShri Abhyankar   }
5345b2b2dd24SShri Abhyankar 
53463649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5347b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5348b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5349b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5350b2b2dd24SShri Abhyankar }
5351b2b2dd24SShri Abhyankar 
5352b2b2dd24SShri Abhyankar #undef __FUNCT__
535306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
535406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
53554e2b4712SSatish Balay {
53564e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
53574e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
53586849ba73SBarry Smith   PetscErrorCode    ierr;
5359b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5360b3260449SShri Abhyankar   PetscInt          i,nz;
53615d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5362b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5363b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5364b3260449SShri Abhyankar   const PetscScalar *b;
53654e2b4712SSatish Balay 
53664e2b4712SSatish Balay   PetscFunctionBegin;
53674e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
53684e2b4712SSatish Balay 
53693649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
53701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5371f1af5d2fSBarry Smith   t  = a->solve_work;
53724e2b4712SSatish Balay 
53734e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
53744e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
53754e2b4712SSatish Balay 
53764e2b4712SSatish Balay   /* forward solve the lower triangular */
5377f1af5d2fSBarry Smith   t[0] = b[*r++];
53784e2b4712SSatish Balay   for (i=1; i<n; i++) {
53794e2b4712SSatish Balay     v     = aa + ai[i];
53804e2b4712SSatish Balay     vi    = aj + ai[i];
53814e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5382f1af5d2fSBarry Smith     s1  = b[*r++];
53834e2b4712SSatish Balay     while (nz--) {
5384f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53854e2b4712SSatish Balay     }
5386f1af5d2fSBarry Smith     t[i] = s1;
53874e2b4712SSatish Balay   }
53884e2b4712SSatish Balay   /* backward solve the upper triangular */
53894e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
53904e2b4712SSatish Balay     v    = aa + diag[i] + 1;
53914e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
53924e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5393f1af5d2fSBarry Smith     s1 = t[i];
53944e2b4712SSatish Balay     while (nz--) {
5395f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53964e2b4712SSatish Balay     }
5397f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
53984e2b4712SSatish Balay   }
53994e2b4712SSatish Balay 
54004e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
54014e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54023649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5404dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
54054e2b4712SSatish Balay   PetscFunctionReturn(0);
54064e2b4712SSatish Balay }
5407048b5e81SShri Abhyankar 
5408048b5e81SShri Abhyankar #undef __FUNCT__
5409048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5410048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5411048b5e81SShri Abhyankar {
5412048b5e81SShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5413048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5414048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5415048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5416048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5417048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5418048b5e81SShri Abhyankar   const PetscScalar *b;
5419048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5420048b5e81SShri Abhyankar 
5421048b5e81SShri Abhyankar   PetscFunctionBegin;
5422048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5423048b5e81SShri Abhyankar 
54243649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5425048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5426048b5e81SShri Abhyankar   tmp  = a->solve_work;
5427048b5e81SShri Abhyankar 
5428048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5429048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5430048b5e81SShri Abhyankar 
5431048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5432048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5433048b5e81SShri Abhyankar   v      = aa;
5434048b5e81SShri Abhyankar   vi     = aj;
5435048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5436048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5437048b5e81SShri Abhyankar     sum = b[r[i]];
5438048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5439048b5e81SShri Abhyankar     tmp[i] = sum;
5440048b5e81SShri Abhyankar     v += nz; vi += nz;
5441048b5e81SShri Abhyankar   }
5442048b5e81SShri Abhyankar 
5443048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5444048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5445048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5446048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5447048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5448048b5e81SShri Abhyankar     sum = tmp[i];
5449048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5450048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5451048b5e81SShri Abhyankar   }
5452048b5e81SShri Abhyankar 
5453048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5454048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5456048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5457048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5458048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5459048b5e81SShri Abhyankar }
5460048b5e81SShri Abhyankar 
546115091d37SBarry Smith /*
546215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
546315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
546415091d37SBarry Smith */
54654a2ae208SSatish Balay #undef __FUNCT__
546606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
546706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
546815091d37SBarry Smith {
546915091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5470b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5471dfbe8321SBarry Smith   PetscErrorCode    ierr;
5472b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5473b3260449SShri Abhyankar   PetscScalar       *x;
5474b3260449SShri Abhyankar   const PetscScalar *b;
547587828ca2SBarry Smith   PetscScalar       s1,x1;
5476b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
547715091d37SBarry Smith 
547815091d37SBarry Smith   PetscFunctionBegin;
54793649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
54801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
548115091d37SBarry Smith 
548215091d37SBarry Smith   /* forward solve the lower triangular */
548315091d37SBarry Smith   idx    = 0;
548415091d37SBarry Smith   x[0]   = b[0];
548515091d37SBarry Smith   for (i=1; i<n; i++) {
548615091d37SBarry Smith     v     =  aa      + ai[i];
548715091d37SBarry Smith     vi    =  aj      + ai[i];
548815091d37SBarry Smith     nz    =  diag[i] - ai[i];
548915091d37SBarry Smith     idx   +=  1;
5490f1af5d2fSBarry Smith     s1  =  b[idx];
549115091d37SBarry Smith     while (nz--) {
549215091d37SBarry Smith       jdx   = *vi++;
549315091d37SBarry Smith       x1    = x[jdx];
5494f1af5d2fSBarry Smith       s1 -= v[0]*x1;
549515091d37SBarry Smith       v    += 1;
549615091d37SBarry Smith     }
5497f1af5d2fSBarry Smith     x[idx]   = s1;
549815091d37SBarry Smith   }
549915091d37SBarry Smith   /* backward solve the upper triangular */
550015091d37SBarry Smith   for (i=n-1; i>=0; i--){
550115091d37SBarry Smith     v    = aa + diag[i] + 1;
550215091d37SBarry Smith     vi   = aj + diag[i] + 1;
550315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
550415091d37SBarry Smith     idt  = i;
5505f1af5d2fSBarry Smith     s1 = x[idt];
550615091d37SBarry Smith     while (nz--) {
550715091d37SBarry Smith       idx   = *vi++;
550815091d37SBarry Smith       x1    = x[idx];
5509f1af5d2fSBarry Smith       s1 -= v[0]*x1;
551015091d37SBarry Smith       v    += 1;
551115091d37SBarry Smith     }
551215091d37SBarry Smith     v        = aa +  diag[i];
5513f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
551415091d37SBarry Smith   }
55153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
55161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5517dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
551815091d37SBarry Smith   PetscFunctionReturn(0);
551915091d37SBarry Smith }
55204e2b4712SSatish Balay 
5521048b5e81SShri Abhyankar 
5522048b5e81SShri Abhyankar #undef __FUNCT__
5523048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5524048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5525048b5e81SShri Abhyankar {
5526048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5527048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5528048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5529048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5530048b5e81SShri Abhyankar   const PetscScalar *b;
5531048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5532048b5e81SShri Abhyankar   PetscInt          i,nz;
5533048b5e81SShri Abhyankar 
5534048b5e81SShri Abhyankar   PetscFunctionBegin;
5535048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5536048b5e81SShri Abhyankar 
55373649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5538048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5539048b5e81SShri Abhyankar 
5540048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5541048b5e81SShri Abhyankar   x[0] = b[0];
5542048b5e81SShri Abhyankar   v    = aa;
5543048b5e81SShri Abhyankar   vi   = aj;
5544048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5545048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5546048b5e81SShri Abhyankar     sum = b[i];
5547048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5548048b5e81SShri Abhyankar     v  += nz;
5549048b5e81SShri Abhyankar     vi += nz;
5550048b5e81SShri Abhyankar     x[i] = sum;
5551048b5e81SShri Abhyankar   }
5552048b5e81SShri Abhyankar 
5553048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5554048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5555048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5556048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5557048b5e81SShri Abhyankar     nz = adiag[i] - adiag[i+1]-1;
5558048b5e81SShri Abhyankar     sum = x[i];
5559048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5560048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5561048b5e81SShri Abhyankar   }
5562048b5e81SShri Abhyankar 
5563048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
55643649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5565048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5566048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5567048b5e81SShri Abhyankar }
5568048b5e81SShri Abhyankar 
55694e2b4712SSatish Balay /* ----------------------------------------------------------------*/
5570*09573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
55716bce7ff8SHong Zhang 
55722b0b2ea7SShri Abhyankar #undef __FUNCT__
557329a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5574766f9fbaSBarry Smith /*
5575766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5576766f9fbaSBarry Smith */
557729a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
55782b0b2ea7SShri Abhyankar {
55792b0b2ea7SShri Abhyankar   Mat             C=B;
55802b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
55812b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5582766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5583766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5584766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5585766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5586766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
55872b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
55880fa040f9SShri Abhyankar   PetscInt        sol_ver;
55892b0b2ea7SShri Abhyankar 
55902b0b2ea7SShri Abhyankar   PetscFunctionBegin;
55912b0b2ea7SShri Abhyankar 
55920fa040f9SShri Abhyankar   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
55930fa040f9SShri Abhyankar 
55942b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
55952b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
55962b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
55972b0b2ea7SShri Abhyankar 
55982b0b2ea7SShri Abhyankar   for (i=0; i<n; i++){
55992b0b2ea7SShri Abhyankar     /* zero rtmp */
56002b0b2ea7SShri Abhyankar     /* L part */
56012b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
56022b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
56032b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
56042b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56052b0b2ea7SShri Abhyankar     }
56062b0b2ea7SShri Abhyankar 
56072b0b2ea7SShri Abhyankar     /* U part */
56082b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
56092b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56102b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
56112b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56122b0b2ea7SShri Abhyankar     }
56132b0b2ea7SShri Abhyankar 
56142b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
561529a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
561629a97285SShri Abhyankar     ajtmp = aj + ai[i];
561729a97285SShri Abhyankar     v     = aa + bs2*ai[i];
56182b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
561929a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56202b0b2ea7SShri Abhyankar     }
56212b0b2ea7SShri Abhyankar 
56222b0b2ea7SShri Abhyankar     /* elimination */
56232b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
56242b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
56252b0b2ea7SShri Abhyankar     for(k=0;k < nzL;k++) {
56262b0b2ea7SShri Abhyankar       row = bjtmp[k];
56272b0b2ea7SShri Abhyankar       pc = rtmp + bs2*row;
56282b0b2ea7SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
56292b0b2ea7SShri Abhyankar       if (flg) {
56302b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
5631766f9fbaSBarry Smith 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5632766f9fbaSBarry Smith 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
56332b0b2ea7SShri Abhyankar 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
56342b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
56352b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56362b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5637766f9fbaSBarry Smith           vv   = rtmp + bs2*pj[j];
5638766f9fbaSBarry Smith           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5639766f9fbaSBarry Smith 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
56402b0b2ea7SShri Abhyankar 	  pv  += bs2;
56412b0b2ea7SShri Abhyankar         }
5642766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56432b0b2ea7SShri Abhyankar       }
56442b0b2ea7SShri Abhyankar     }
56452b0b2ea7SShri Abhyankar 
56462b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
56472b0b2ea7SShri Abhyankar     /* L part */
56482b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
56492b0b2ea7SShri Abhyankar     pj   = b->j + bi[i] ;
56502b0b2ea7SShri Abhyankar     nz   = bi[i+1] - bi[i];
56512b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56522b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56532b0b2ea7SShri Abhyankar     }
56542b0b2ea7SShri Abhyankar 
56552b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56562b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
56572b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
56582b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5659766f9fbaSBarry Smith     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5660182b8fbaSHong Zhang     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
56612b0b2ea7SShri Abhyankar 
56622b0b2ea7SShri Abhyankar     /* U part */
56632b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56642b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56652b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56662b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++){
56672b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56682b0b2ea7SShri Abhyankar     }
56692b0b2ea7SShri Abhyankar   }
56702b0b2ea7SShri Abhyankar 
56712b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5672832cc040SShri Abhyankar   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5673766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
56742b0b2ea7SShri Abhyankar   C->assembled = PETSC_TRUE;
5675766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56762b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
56772b0b2ea7SShri Abhyankar }
56782b0b2ea7SShri Abhyankar 
/*
   MatLUFactorNumeric_SeqBAIJ_N - Numeric block LU factorization of the SeqBAIJ matrix A
   into the factor matrix B, for the block size read from A->rmap->bs.

   Input Parameters:
     B    - factor matrix; its index arrays b->i, b->j, b->diag and its index sets
            b->row, b->icol are read here and must already be filled in
            (presumably by the symbolic factorization - confirm against the caller)
     A    - matrix supplying the values to factor
     info - factorization options; not referenced anywhere in this routine

   On return b->a holds, for every block row i, the L blocks at bi[i]..bi[i+1]-1, the
   INVERTED diagonal block at bdiag[i], and the remaining U blocks at
   bdiag[i+1]+1..bdiag[i]-1. The solve/solvetranspose function pointers of B are set
   and B is marked assembled.

   Each block row is expanded into the dense work array rtmp (one bs2-sized slot per
   block column), eliminated against the rows listed in its L part, then copied back.
*/
56796bce7ff8SHong Zhang #undef __FUNCT__
56804dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
56814dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
56826bce7ff8SHong Zhang {
56836bce7ff8SHong Zhang   Mat            C=B;
56846bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
56856bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
56866bce7ff8SHong Zhang   PetscErrorCode ierr;
56875a586d82SBarry Smith   const PetscInt *r,*ic;
56886bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
56896bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5690b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5691914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5692914a18a2SHong Zhang   MatScalar      *v_work;
5693ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity;
56946bce7ff8SHong Zhang 
56956bce7ff8SHong Zhang   PetscFunctionBegin;
  /* r is applied to row numbers of A and ic to column indices of A when a row is loaded below */
56966bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
56976bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5698ae3d28f0SHong Zhang 
  /* rtmp: dense copy of the active block row, one bs2-sized slot per block column, zeroed once up front */
5699fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5700fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
57016bce7ff8SHong Zhang 
5702914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
  /* v_work (bs scalars) and v_pivots (bs ints) are handed to Kernel_A_gets_inverse_A,
     mwork (bs2 scalars) to Kernel_A_gets_A_times_B */
5703fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5704914a18a2SHong Zhang 
57056bce7ff8SHong Zhang   for (i=0; i<n; i++){
57066bce7ff8SHong Zhang     /* zero rtmp */
    /* only the slots named by row i of the factor pattern are cleared, not all n slots */
57076bce7ff8SHong Zhang     /* L part */
57086bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
57096bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5710914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5711914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5712914a18a2SHong Zhang     }
57136bce7ff8SHong Zhang 
57146bce7ff8SHong Zhang     /* U part */
    /* nz is not reduced by one here, so the entry at bdiag[i] (the diagonal block) is cleared too */
57151a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
57161a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
57171a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
57181a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57191a83e813SShri Abhyankar     }
57201a83e813SShri Abhyankar 
57211a83e813SShri Abhyankar     /* load in initial (unfactored row) */
    /* row r[i] of A is read; each of its blocks is placed in the slot of column ic[ajtmp[j]] */
57221a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
57231a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
57241a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
57251a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57261a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
57271a83e813SShri Abhyankar     }
57281a83e813SShri Abhyankar 
57291a83e813SShri Abhyankar     /* elimination */
    /* visit the block columns of the L part of row i in stored order; each names a pivot row */
57301a83e813SShri Abhyankar     bjtmp = bj + bi[i];
57311a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
57321a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
57331a83e813SShri Abhyankar       row = bjtmp[k];
57341a83e813SShri Abhyankar       pc = rtmp + bs2*row;
      /* flg stays 0 only if all bs2 entries of the block at pc are exactly zero; then the update is skipped */
57351a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
57361a83e813SShri Abhyankar       if (flg) {
        /* diagonal block of the pivot row; it was inverted in place when that row was finished
           (see the diagonal step below) - assumes row < i, TODO confirm from the symbolic phase */
57371a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
57381a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
57391a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
57401a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
57411a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
57421a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
          /* going by the kernel name: slot(pj[j]) -= pc * U(row,pj[j]) */
57431a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
57441a83e813SShri Abhyankar         }
57451a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
57461a83e813SShri Abhyankar       }
57471a83e813SShri Abhyankar     }
57481a83e813SShri Abhyankar 
57491a83e813SShri Abhyankar     /* finished row so stick it into b->a */
57501a83e813SShri Abhyankar     /* L part */
57511a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
57521a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
57531a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
57541a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57551a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57561a83e813SShri Abhyankar     }
57571a83e813SShri Abhyankar 
57581a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simpler triangular solves */
57591a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
57601a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
5761e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
    /* copy the diagonal block out of rtmp, then invert it in place inside b->a */
57621a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57631a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
57641a83e813SShri Abhyankar 
57651a83e813SShri Abhyankar     /* U part */
    /* here nz excludes the diagonal, which was stored (inverted) just above */
57661a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
57671a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
57681a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
57691a83e813SShri Abhyankar     for (j=0; j<nz; j++){
57701a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57711a83e813SShri Abhyankar     }
57721a83e813SShri Abhyankar   }
57731a83e813SShri Abhyankar 
  /* release the work arrays and hand the index arrays back to their index sets */
57741a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5775fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
57761a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
57771a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
57781a83e813SShri Abhyankar 
  /* the natural-ordering solve is used only when both b->row and b->icol are identity index sets */
5779ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5780ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5781ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
5782ae3d28f0SHong Zhang   if (both_identity){
57834dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5784ae3d28f0SHong Zhang   } else {
57854dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5786ae3d28f0SHong Zhang   }
  /* the transpose solve is the same routine for either ordering */
57874dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5788ae3d28f0SHong Zhang 
57891a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5790766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
57911a83e813SShri Abhyankar   PetscFunctionReturn(0);
57921a83e813SShri Abhyankar }
57931a83e813SShri Abhyankar 
57946bce7ff8SHong Zhang /*
57956bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
57964dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
57974dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
57986bce7ff8SHong Zhang */
5799c0c7eb62SShri Abhyankar 
58006bce7ff8SHong Zhang #undef __FUNCT__
58014dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
58024dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
58036bce7ff8SHong Zhang {
58046bce7ff8SHong Zhang 
58056bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
58066bce7ff8SHong Zhang   PetscErrorCode     ierr;
580716a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
580835aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
580935aa4fcfSShri Abhyankar 
581035aa4fcfSShri Abhyankar   PetscFunctionBegin;
581135aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
581235aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
581335aa4fcfSShri Abhyankar 
581435aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
581535aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
581635aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
581735aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
581835aa4fcfSShri Abhyankar   if (!b->diag){
581935aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
582035aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
582135aa4fcfSShri Abhyankar   }
582235aa4fcfSShri Abhyankar   bdiag = b->diag;
582335aa4fcfSShri Abhyankar 
582435aa4fcfSShri Abhyankar   if (n > 0) {
582535aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
582635aa4fcfSShri Abhyankar   }
582735aa4fcfSShri Abhyankar 
582835aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
582935aa4fcfSShri Abhyankar   bi = b->i;
583035aa4fcfSShri Abhyankar   bj = b->j;
583135aa4fcfSShri Abhyankar 
583235aa4fcfSShri Abhyankar   /* L part */
583335aa4fcfSShri Abhyankar   bi[0] = 0;
583435aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
583535aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
583635aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
583735aa4fcfSShri Abhyankar     aj = a->j + ai[i];
583835aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
583935aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
584035aa4fcfSShri Abhyankar     }
584135aa4fcfSShri Abhyankar   }
584235aa4fcfSShri Abhyankar 
584335aa4fcfSShri Abhyankar   /* U part */
584435aa4fcfSShri Abhyankar   bi_temp = bi[n];
584535aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
584635aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
584735aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
584835aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
584935aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
585035aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
585135aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
585235aa4fcfSShri Abhyankar     }
585335aa4fcfSShri Abhyankar     /* diag[i] */
585435aa4fcfSShri Abhyankar     *bj = i; bj++;
585535aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
585635aa4fcfSShri Abhyankar   }
585735aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
585835aa4fcfSShri Abhyankar }
585935aa4fcfSShri Abhyankar 
586035aa4fcfSShri Abhyankar #undef __FUNCT__
58614dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58624dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
586316a2bf60SHong Zhang {
586416a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
586516a2bf60SHong Zhang   IS                 isicol;
586616a2bf60SHong Zhang   PetscErrorCode     ierr;
586716a2bf60SHong Zhang   const PetscInt     *r,*ic;
58687fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
586916a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
587016a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
587116a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
5872ace3abfcSBarry Smith   PetscBool          col_identity,row_identity,both_identity;
587316a2bf60SHong Zhang   PetscReal          f;
587416a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
587516a2bf60SHong Zhang   PetscBT            lnkbt;
587616a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
587716a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
587816a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5879ace3abfcSBarry Smith   PetscBool          missing;
58807fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
588116a2bf60SHong Zhang 
588216a2bf60SHong Zhang   PetscFunctionBegin;
5883e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
58846ba06ab7SHong Zhang   if (bs>1){  /* check shifttype */
58856ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
58866ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
58876ba06ab7SHong Zhang   }
58886ba06ab7SHong Zhang 
588916a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5890e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
589116a2bf60SHong Zhang 
589216a2bf60SHong Zhang   f             = info->fill;
589316a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
589416a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
589516a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
589616a2bf60SHong Zhang 
589716a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
589816a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5899ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
590016a2bf60SHong Zhang 
59017fa3a6a0SHong Zhang   if (!levels && both_identity) {
590216a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
59034dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
59044dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
590535aa4fcfSShri Abhyankar 
5906d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
590735aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
590835aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
590935aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
591035aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
591135aa4fcfSShri Abhyankar     b->row           = isrow;
591235aa4fcfSShri Abhyankar     b->col           = iscol;
591335aa4fcfSShri Abhyankar     b->icol          = isicol;
591435aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
591535aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
591635aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
591735aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
591835aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
591935aa4fcfSShri Abhyankar   }
592035aa4fcfSShri Abhyankar 
592135aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
592235aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
592335aa4fcfSShri Abhyankar 
592435aa4fcfSShri Abhyankar   /* get new row pointers */
592535aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
592635aa4fcfSShri Abhyankar   bi[0] = 0;
592735aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
592835aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
592935aa4fcfSShri Abhyankar   bdiag[0]  = 0;
593035aa4fcfSShri Abhyankar 
5931fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
593235aa4fcfSShri Abhyankar 
593335aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
593435aa4fcfSShri Abhyankar   nlnk = n + 1;
593535aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
593635aa4fcfSShri Abhyankar 
593735aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
593835aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
593935aa4fcfSShri Abhyankar   current_space = free_space;
594035aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
594135aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
594235aa4fcfSShri Abhyankar 
594335aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
594435aa4fcfSShri Abhyankar     nzi = 0;
594535aa4fcfSShri Abhyankar     /* copy current row into linked list */
594635aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5947e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
594835aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
594935aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
595035aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
595135aa4fcfSShri Abhyankar     nzi += nlnk;
595235aa4fcfSShri Abhyankar 
595335aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
595435aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
595535aa4fcfSShri Abhyankar       fm = n;
595635aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
595735aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
595835aa4fcfSShri Abhyankar       lnk[fm]    = i;
595935aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
596035aa4fcfSShri Abhyankar       nzi++; dcount++;
596135aa4fcfSShri Abhyankar     }
596235aa4fcfSShri Abhyankar 
596335aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
596435aa4fcfSShri Abhyankar     nzbd = 0;
596535aa4fcfSShri Abhyankar     prow = lnk[n];
596635aa4fcfSShri Abhyankar     while (prow < i) {
596735aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
596835aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
596935aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
597035aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
597135aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
597235aa4fcfSShri Abhyankar       nzi += nlnk;
597335aa4fcfSShri Abhyankar       prow = lnk[prow];
597435aa4fcfSShri Abhyankar       nzbd++;
597535aa4fcfSShri Abhyankar     }
597635aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
597735aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
597835aa4fcfSShri Abhyankar 
597935aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
598035aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
598135aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
598235aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
598335aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
598435aa4fcfSShri Abhyankar       reallocs++;
598535aa4fcfSShri Abhyankar     }
598635aa4fcfSShri Abhyankar 
598735aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
598835aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
598935aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
599035aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
599135aa4fcfSShri Abhyankar 
599235aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
599365e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
599435aa4fcfSShri Abhyankar 
599535aa4fcfSShri Abhyankar     current_space->array           += nzi;
599635aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
599735aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
599835aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
599935aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
600035aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
600135aa4fcfSShri Abhyankar   }
600235aa4fcfSShri Abhyankar 
600335aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
600435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
600535aa4fcfSShri Abhyankar 
600635aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
60079263d837SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
60082ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
600935aa4fcfSShri Abhyankar 
601035aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
601135aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
6012fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
601335aa4fcfSShri Abhyankar 
601435aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
601535aa4fcfSShri Abhyankar   {
6016aef85c9fSShri Abhyankar     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
601735aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
601835aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
601935aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
602035aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
602135aa4fcfSShri Abhyankar     if (diagonal_fill) {
602235aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
602335aa4fcfSShri Abhyankar     }
602435aa4fcfSShri Abhyankar   }
602535aa4fcfSShri Abhyankar #endif
602635aa4fcfSShri Abhyankar 
602735aa4fcfSShri Abhyankar   /* put together the new matrix */
602835aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
602935aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
603035aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
603135aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
603235aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
603335aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
603435aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
603535aa4fcfSShri Abhyankar   b->j          = bj;
603635aa4fcfSShri Abhyankar   b->i          = bi;
603735aa4fcfSShri Abhyankar   b->diag       = bdiag;
603835aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
603935aa4fcfSShri Abhyankar   b->ilen       = 0;
604035aa4fcfSShri Abhyankar   b->imax       = 0;
604135aa4fcfSShri Abhyankar   b->row        = isrow;
604235aa4fcfSShri Abhyankar   b->col        = iscol;
604335aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
604435aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
604535aa4fcfSShri Abhyankar   b->icol       = isicol;
604635aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
604735aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
604835aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
604935aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
605035aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
6051ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
6052ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6053ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
60544dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
605535aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
605635aa4fcfSShri Abhyankar }
605735aa4fcfSShri Abhyankar 
60584e2b4712SSatish Balay /*
60594e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
60604e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
60614e2b4712SSatish Balay    Not a good example of code reuse.
60624e2b4712SSatish Balay */
60634a2ae208SSatish Balay #undef __FUNCT__
606406e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
606506e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
60664e2b4712SSatish Balay {
60674e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
60684e2b4712SSatish Balay   IS             isicol;
60696849ba73SBarry Smith   PetscErrorCode ierr;
60705d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
60715d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6072a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6073d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6074ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity,flg;
6075329f5518SBarry Smith   PetscReal      f;
60764e2b4712SSatish Balay 
60774e2b4712SSatish Balay   PetscFunctionBegin;
60786bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6079e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
60806bce7ff8SHong Zhang 
6081435faa5fSBarry Smith   f             = info->fill;
6082690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
6083690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
60844c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
608516a2bf60SHong Zhang 
6086667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6087667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6088ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
6089309c388cSBarry Smith 
609041df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
609116a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
60928b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
60936bce7ff8SHong Zhang 
6094d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
6095ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
6096bb3d539aSBarry Smith     b->row       = isrow;
6097bb3d539aSBarry Smith     b->col       = iscol;
6098bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6099bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6100bb3d539aSBarry Smith     b->icol      = isicol;
6101bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6102b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
61036bce7ff8SHong Zhang     PetscFunctionReturn(0);
61046bce7ff8SHong Zhang   }
61056bce7ff8SHong Zhang 
61066bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
61074e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
61084e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
61094e2b4712SSatish Balay 
61104e2b4712SSatish Balay     /* get new row pointers */
6111690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
61124e2b4712SSatish Balay     ainew[0] = 0;
61134e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
6114690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
6115690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
61164e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
6117690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
61184e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
6119690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
61204e2b4712SSatish Balay     /* im is level for each filled value */
6121690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
61224e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
6123690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
61244e2b4712SSatish Balay     dloc[0]  = 0;
61254e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
6126435faa5fSBarry Smith 
6127435faa5fSBarry Smith       /* copy prow into linked list */
61284e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6129e32f2f54SBarry Smith       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
61304e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
61314e2b4712SSatish Balay       fill[n]    = n;
6132435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
61334e2b4712SSatish Balay       while (nz--) {
61344e2b4712SSatish Balay 	fm  = n;
61354e2b4712SSatish Balay 	idx = ic[*xi++];
61364e2b4712SSatish Balay 	do {
61374e2b4712SSatish Balay 	  m  = fm;
61384e2b4712SSatish Balay 	  fm = fill[m];
61394e2b4712SSatish Balay 	} while (fm < idx);
61404e2b4712SSatish Balay 	fill[m]   = idx;
61414e2b4712SSatish Balay 	fill[idx] = fm;
61424e2b4712SSatish Balay 	im[idx]   = 0;
61434e2b4712SSatish Balay       }
6144435faa5fSBarry Smith 
6145435faa5fSBarry Smith       /* make sure diagonal entry is included */
6146435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
6147435faa5fSBarry Smith 	fm = n;
6148435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
6149435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6150435faa5fSBarry Smith 	fill[fm]   = prow;
6151435faa5fSBarry Smith 	im[prow]   = 0;
6152435faa5fSBarry Smith 	nzf++;
6153335d9088SBarry Smith 	dcount++;
6154435faa5fSBarry Smith       }
6155435faa5fSBarry Smith 
61564e2b4712SSatish Balay       nzi = 0;
61574e2b4712SSatish Balay       row = fill[n];
61584e2b4712SSatish Balay       while (row < prow) {
61594e2b4712SSatish Balay 	incrlev = im[row] + 1;
61604e2b4712SSatish Balay 	nz      = dloc[row];
6161435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
61624e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
61634e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
61644e2b4712SSatish Balay 	fm      = row;
61654e2b4712SSatish Balay 	while (nnz-- > 0) {
61664e2b4712SSatish Balay 	  idx = *xi++;
61674e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
61684e2b4712SSatish Balay 	    flev++;
61694e2b4712SSatish Balay 	    continue;
61704e2b4712SSatish Balay 	  }
61714e2b4712SSatish Balay 	  do {
61724e2b4712SSatish Balay 	    m  = fm;
61734e2b4712SSatish Balay 	    fm = fill[m];
61744e2b4712SSatish Balay 	  } while (fm < idx);
61754e2b4712SSatish Balay 	  if (fm != idx) {
61764e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
61774e2b4712SSatish Balay 	    fill[m]   = idx;
61784e2b4712SSatish Balay 	    fill[idx] = fm;
61794e2b4712SSatish Balay 	    fm        = idx;
61804e2b4712SSatish Balay 	    nzf++;
6181ecf371e4SBarry Smith 	  } else {
61824e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
61834e2b4712SSatish Balay 	  }
61844e2b4712SSatish Balay 	  flev++;
61854e2b4712SSatish Balay 	}
61864e2b4712SSatish Balay 	row = fill[row];
61874e2b4712SSatish Balay 	nzi++;
61884e2b4712SSatish Balay       }
61894e2b4712SSatish Balay       /* copy new filled row into permanent storage */
61904e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
61914e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6192ecf371e4SBarry Smith 
6193ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
6194ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6195ecf371e4SBarry Smith 	/* just double the memory each time */
6196690b6cddSBarry Smith 	PetscInt maxadd = jmax;
6197ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
61984e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
61994e2b4712SSatish Balay 	jmax += maxadd;
6200ecf371e4SBarry Smith 
6201ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
62025d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
62035d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6204606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
62055d0c19d7SBarry Smith 	ajnew = xitmp;
62065d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
62075d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6208606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
62095d0c19d7SBarry Smith 	ajfill = xitmp;
6210eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
62114e2b4712SSatish Balay       }
62125d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
62134e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
62144e2b4712SSatish Balay       dloc[prow]  = nzi;
62154e2b4712SSatish Balay       fm          = fill[n];
62164e2b4712SSatish Balay       while (nzf--) {
62175d0c19d7SBarry Smith 	*xitmp++ = fm;
62184e2b4712SSatish Balay 	*flev++ = im[fm];
62194e2b4712SSatish Balay 	fm      = fill[fm];
62204e2b4712SSatish Balay       }
6221435faa5fSBarry Smith       /* make sure row has diagonal entry */
6222435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6223e32f2f54SBarry Smith 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
62242401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6225435faa5fSBarry Smith       }
62264e2b4712SSatish Balay     }
6227606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
62284e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
62294e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6230606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6231606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
62324e2b4712SSatish Balay 
62336cf91177SBarry Smith #if defined(PETSC_USE_INFO)
62344e2b4712SSatish Balay     {
6235329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6236ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6237ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6238ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6239ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6240335d9088SBarry Smith       if (diagonal_fill) {
6241ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6242335d9088SBarry Smith       }
62434e2b4712SSatish Balay     }
624463ba0a88SBarry Smith #endif
62454e2b4712SSatish Balay 
62464e2b4712SSatish Balay     /* put together the new matrix */
6247719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6248719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6249ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
6250e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6251e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
62527c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6253a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
62544e2b4712SSatish Balay     b->j          = ajnew;
62554e2b4712SSatish Balay     b->i          = ainew;
62564e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
62574e2b4712SSatish Balay     b->diag       = dloc;
62587f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
62594e2b4712SSatish Balay     b->ilen       = 0;
62604e2b4712SSatish Balay     b->imax       = 0;
62614e2b4712SSatish Balay     b->row        = isrow;
62624e2b4712SSatish Balay     b->col        = iscol;
6263bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6264c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6265c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6266e51c0b9cSSatish Balay     b->icol       = isicol;
626787828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
62684e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
62694e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6270719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
62714e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
62724e2b4712SSatish Balay 
6273ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
6274ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
6275ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
62766bce7ff8SHong Zhang 
62778b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
62788661488fSKris Buschelman   PetscFunctionReturn(0);
62798661488fSKris Buschelman }
62808661488fSKris Buschelman 
6281732ee342SKris Buschelman #undef __FUNCT__
62827e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6283dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
62847e7071cdSKris Buschelman {
628512272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
628612272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
62875a9542e3SKris Buschelman   PetscFunctionBegin;
62887cf1b8d3SKris Buschelman   /* Undo Column scaling */
62897cf1b8d3SKris Buschelman /*    while (nz--) { */
62907cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
62917cf1b8d3SKris Buschelman /*    } */
6292c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6293c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62947cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
62957cf1b8d3SKris Buschelman }
62967cf1b8d3SKris Buschelman 
62977cf1b8d3SKris Buschelman #undef __FUNCT__
62987cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6299dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
63007cf1b8d3SKris Buschelman {
63017cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6302b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
63032aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
63045a9542e3SKris Buschelman   PetscFunctionBegin;
63050b9da03eSKris Buschelman   /* Is this really necessary? */
630620235379SKris Buschelman   while (nz--) {
63070b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
63087e7071cdSKris Buschelman   }
6309c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
63107e7071cdSKris Buschelman   PetscFunctionReturn(0);
63117e7071cdSKris Buschelman }
63127e7071cdSKris Buschelman 
6313732ee342SKris Buschelman 
6314