xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision c55dd7997d3dcfad566344de0d981c7ad0a7da26)
1be1d678aSKris Buschelman 
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
7c6db04a5SJed Brown #include <../src/mat/blockinvert.h>
8c6db04a5SJed Brown #include <petscbt.h>
9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1493fd935bSShri Abhyankar {
1593fd935bSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1693fd935bSShri Abhyankar   PetscErrorCode    ierr;
1793fd935bSShri Abhyankar   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
1893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
1993fd935bSShri Abhyankar   PetscInt          nz;
2093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
2193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
2293fd935bSShri Abhyankar   const PetscScalar *b;
2393fd935bSShri Abhyankar 
2493fd935bSShri Abhyankar   PetscFunctionBegin;
253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2793fd935bSShri Abhyankar   tmp  = a->solve_work;
2893fd935bSShri Abhyankar 
2993fd935bSShri Abhyankar 
3093fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
3193fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[i];
3293fd935bSShri Abhyankar 
3393fd935bSShri Abhyankar   /* forward solve the U^T */
3493fd935bSShri Abhyankar   for (i=0; i<n; i++) {
3593fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
3693fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
3793fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
3893fd935bSShri Abhyankar     s1  = tmp[i];
3993fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
4093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
4193fd935bSShri Abhyankar     tmp[i] = s1;
4293fd935bSShri Abhyankar   }
4393fd935bSShri Abhyankar 
4493fd935bSShri Abhyankar   /* backward solve the L^T */
4593fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
4693fd935bSShri Abhyankar     v   = aa + ai[i];
4793fd935bSShri Abhyankar     vi  = aj + ai[i];
4893fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
4993fd935bSShri Abhyankar     s1  = tmp[i];
5093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
5193fd935bSShri Abhyankar   }
5293fd935bSShri Abhyankar 
5393fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
5493fd935bSShri Abhyankar   for (i=0; i<n; i++) x[i] = tmp[i];
5593fd935bSShri Abhyankar 
563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5793fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5893fd935bSShri Abhyankar 
5993fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
6093fd935bSShri Abhyankar   PetscFunctionReturn(0);
6193fd935bSShri Abhyankar }
6293fd935bSShri Abhyankar 
6393fd935bSShri Abhyankar #undef __FUNCT__
6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66f1af5d2fSBarry Smith {
67f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68dfbe8321SBarry Smith   PetscErrorCode    ierr;
690b68f018SBarry Smith   PetscInt          i,nz;
700b68f018SBarry Smith   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
710b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
720b68f018SBarry Smith   PetscScalar       s1,*x;
73f1af5d2fSBarry Smith 
74f1af5d2fSBarry Smith   PetscFunctionBegin;
75ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77f1af5d2fSBarry Smith 
78f1af5d2fSBarry Smith   /* forward solve the U^T */
79f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith     v     = aa + diag[i];
82f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
83ef66eb69SBarry Smith     s1    = (*v++)*x[i];
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
88f1af5d2fSBarry Smith     }
89f1af5d2fSBarry Smith     x[i]   = s1;
90f1af5d2fSBarry Smith   }
91f1af5d2fSBarry Smith   /* backward solve the L^T */
92f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
93f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
94f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
95f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
96f1af5d2fSBarry Smith     s1   = x[i];
97f1af5d2fSBarry Smith     while (nz--) {
98f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
99f1af5d2fSBarry Smith     }
100f1af5d2fSBarry Smith   }
1011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103f1af5d2fSBarry Smith   PetscFunctionReturn(0);
104f1af5d2fSBarry Smith }
105f1af5d2fSBarry Smith 
1064a2ae208SSatish Balay #undef __FUNCT__
10706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
10806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109f1af5d2fSBarry Smith {
110f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
111dfbe8321SBarry Smith   PetscErrorCode    ierr;
112b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
113b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
115b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
116f1af5d2fSBarry Smith 
117f1af5d2fSBarry Smith   PetscFunctionBegin;
118ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120f1af5d2fSBarry Smith 
121f1af5d2fSBarry Smith   /* forward solve the U^T */
122f1af5d2fSBarry Smith   idx = 0;
123f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
124f1af5d2fSBarry Smith 
125f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
126f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
127ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
128f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
129f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
130f1af5d2fSBarry Smith     v += 4;
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
133f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
134f1af5d2fSBarry Smith     while (nz--) {
135f1af5d2fSBarry Smith       oidx = 2*(*vi++);
136f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138f1af5d2fSBarry Smith       v  += 4;
139f1af5d2fSBarry Smith     }
140f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
141f1af5d2fSBarry Smith     idx += 2;
142f1af5d2fSBarry Smith   }
143f1af5d2fSBarry Smith   /* backward solve the L^T */
144f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
145f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
146f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
147f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
148f1af5d2fSBarry Smith     idt  = 2*i;
149f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
150f1af5d2fSBarry Smith     while (nz--) {
151f1af5d2fSBarry Smith       idx   = 2*(*vi--);
152f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154f1af5d2fSBarry Smith       v -= 4;
155f1af5d2fSBarry Smith     }
156f1af5d2fSBarry Smith   }
1571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159f1af5d2fSBarry Smith   PetscFunctionReturn(0);
160f1af5d2fSBarry Smith }
161f1af5d2fSBarry Smith 
1624a2ae208SSatish Balay #undef __FUNCT__
1634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1656929473cSShri Abhyankar {
1666929473cSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1676929473cSShri Abhyankar   PetscErrorCode    ierr;
168b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1696929473cSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
170b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
171b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
172b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
1736929473cSShri Abhyankar 
1746929473cSShri Abhyankar   PetscFunctionBegin;
1756929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1766929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1776929473cSShri Abhyankar 
1786929473cSShri Abhyankar   /* forward solve the U^T */
1796929473cSShri Abhyankar   idx = 0;
1806929473cSShri Abhyankar   for (i=0; i<n; i++) {
1816929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1826929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1836929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1846929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1856929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1866929473cSShri Abhyankar     v -= bs2;
1876929473cSShri Abhyankar 
1886929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1896929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1906929473cSShri Abhyankar     for(j=0;j>-nz;j--){
1916929473cSShri Abhyankar       oidx = bs*vi[j];
1926929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1936929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1946929473cSShri Abhyankar       v  -= bs2;
1956929473cSShri Abhyankar     }
1966929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
1976929473cSShri Abhyankar     idx += bs;
1986929473cSShri Abhyankar   }
1996929473cSShri Abhyankar   /* backward solve the L^T */
2006929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
2016929473cSShri Abhyankar     v    = aa + bs2*ai[i];
2026929473cSShri Abhyankar     vi   = aj + ai[i];
2036929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
2046929473cSShri Abhyankar     idt  = bs*i;
2056929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
2066929473cSShri Abhyankar     for(j=0;j<nz;j++){
2076929473cSShri Abhyankar       idx   = bs*vi[j];
2086929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
2096929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
2106929473cSShri Abhyankar       v += bs2;
2116929473cSShri Abhyankar     }
2126929473cSShri Abhyankar   }
2136929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2146929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2156929473cSShri Abhyankar   PetscFunctionReturn(0);
2166929473cSShri Abhyankar }
2176929473cSShri Abhyankar 
2186929473cSShri Abhyankar #undef __FUNCT__
21906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
22006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221f1af5d2fSBarry Smith {
222f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
223dfbe8321SBarry Smith   PetscErrorCode    ierr;
224b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
226b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
227b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
228f1af5d2fSBarry Smith 
229f1af5d2fSBarry Smith   PetscFunctionBegin;
230ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232f1af5d2fSBarry Smith 
233f1af5d2fSBarry Smith   /* forward solve the U^T */
234f1af5d2fSBarry Smith   idx = 0;
235f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
236f1af5d2fSBarry Smith 
237f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
238f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
239ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243f1af5d2fSBarry Smith     v += 9;
244f1af5d2fSBarry Smith 
245f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
246f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
247f1af5d2fSBarry Smith     while (nz--) {
248f1af5d2fSBarry Smith       oidx = 3*(*vi++);
249f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252f1af5d2fSBarry Smith       v  += 9;
253f1af5d2fSBarry Smith     }
254f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
255f1af5d2fSBarry Smith     idx += 3;
256f1af5d2fSBarry Smith   }
257f1af5d2fSBarry Smith   /* backward solve the L^T */
258f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
259f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
260f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
261f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
262f1af5d2fSBarry Smith     idt  = 3*i;
263f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264f1af5d2fSBarry Smith     while (nz--) {
265f1af5d2fSBarry Smith       idx   = 3*(*vi--);
266f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269f1af5d2fSBarry Smith       v -= 9;
270f1af5d2fSBarry Smith     }
271f1af5d2fSBarry Smith   }
2721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274f1af5d2fSBarry Smith   PetscFunctionReturn(0);
275f1af5d2fSBarry Smith }
276f1af5d2fSBarry Smith 
2774a2ae208SSatish Balay #undef __FUNCT__
2784dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2794dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2808499736aSShri Abhyankar {
2818499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2828499736aSShri Abhyankar   PetscErrorCode    ierr;
283b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2848499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
285b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
286b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
287b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
2888499736aSShri Abhyankar 
2898499736aSShri Abhyankar   PetscFunctionBegin;
2908499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2918499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2928499736aSShri Abhyankar 
2938499736aSShri Abhyankar   /* forward solve the U^T */
2948499736aSShri Abhyankar   idx = 0;
2958499736aSShri Abhyankar   for (i=0; i<n; i++) {
2968499736aSShri Abhyankar     v     = aa + bs2*diag[i];
2978499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
2988499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
2998499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
3008499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
3018499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
3028499736aSShri Abhyankar     v -= bs2;
3038499736aSShri Abhyankar 
3048499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3058499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3068499736aSShri Abhyankar     for(j=0;j>-nz;j--){
3078499736aSShri Abhyankar       oidx = bs*vi[j];
3088499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3098499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3108499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3118499736aSShri Abhyankar       v  -= bs2;
3128499736aSShri Abhyankar     }
3138499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
3148499736aSShri Abhyankar     idx += bs;
3158499736aSShri Abhyankar   }
3168499736aSShri Abhyankar   /* backward solve the L^T */
3178499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
3188499736aSShri Abhyankar     v    = aa + bs2*ai[i];
3198499736aSShri Abhyankar     vi   = aj + ai[i];
3208499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
3218499736aSShri Abhyankar     idt  = bs*i;
3228499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
3238499736aSShri Abhyankar     for(j=0;j<nz;j++){
3248499736aSShri Abhyankar       idx   = bs*vi[j];
3258499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3268499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3278499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3288499736aSShri Abhyankar       v += bs2;
3298499736aSShri Abhyankar     }
3308499736aSShri Abhyankar   }
3318499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3328499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3338499736aSShri Abhyankar   PetscFunctionReturn(0);
3348499736aSShri Abhyankar }
3358499736aSShri Abhyankar 
3368499736aSShri Abhyankar #undef __FUNCT__
33706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
33806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339f1af5d2fSBarry Smith {
340f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
341dfbe8321SBarry Smith   PetscErrorCode    ierr;
342b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
344b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
345b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
346f1af5d2fSBarry Smith 
347f1af5d2fSBarry Smith   PetscFunctionBegin;
348ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350f1af5d2fSBarry Smith 
351f1af5d2fSBarry Smith   /* forward solve the U^T */
352f1af5d2fSBarry Smith   idx = 0;
353f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
354f1af5d2fSBarry Smith 
355f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
356f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
357ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362f1af5d2fSBarry Smith     v += 16;
363f1af5d2fSBarry Smith 
364f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
365f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
366f1af5d2fSBarry Smith     while (nz--) {
367f1af5d2fSBarry Smith       oidx = 4*(*vi++);
368f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372f1af5d2fSBarry Smith       v  += 16;
373f1af5d2fSBarry Smith     }
374f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375f1af5d2fSBarry Smith     idx += 4;
376f1af5d2fSBarry Smith   }
377f1af5d2fSBarry Smith   /* backward solve the L^T */
378f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
379f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
380f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
381f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
382f1af5d2fSBarry Smith     idt  = 4*i;
383f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384f1af5d2fSBarry Smith     while (nz--) {
385f1af5d2fSBarry Smith       idx   = 4*(*vi--);
386f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390f1af5d2fSBarry Smith       v -= 16;
391f1af5d2fSBarry Smith     }
392f1af5d2fSBarry Smith   }
3931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395f1af5d2fSBarry Smith   PetscFunctionReturn(0);
396f1af5d2fSBarry Smith }
397f1af5d2fSBarry Smith 
3984a2ae208SSatish Balay #undef __FUNCT__
3994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
4004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4018499736aSShri Abhyankar {
4028499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4038499736aSShri Abhyankar   PetscErrorCode    ierr;
404b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
4058499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
406b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
407b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
408b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
4098499736aSShri Abhyankar 
4108499736aSShri Abhyankar   PetscFunctionBegin;
4118499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4128499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4138499736aSShri Abhyankar 
4148499736aSShri Abhyankar   /* forward solve the U^T */
4158499736aSShri Abhyankar   idx = 0;
4168499736aSShri Abhyankar   for (i=0; i<n; i++) {
4178499736aSShri Abhyankar     v     = aa + bs2*diag[i];
4188499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
4198499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
4208499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
4218499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
4228499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
4238499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
4248499736aSShri Abhyankar     v -= bs2;
4258499736aSShri Abhyankar 
4268499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
4278499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
4288499736aSShri Abhyankar     for(j=0;j>-nz;j--){
4298499736aSShri Abhyankar       oidx = bs*vi[j];
4308499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4318499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4328499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4338499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4348499736aSShri Abhyankar       v  -= bs2;
4358499736aSShri Abhyankar     }
4368499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4378499736aSShri Abhyankar     idx += bs;
4388499736aSShri Abhyankar   }
4398499736aSShri Abhyankar   /* backward solve the L^T */
4408499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
4418499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4428499736aSShri Abhyankar     vi   = aj + ai[i];
4438499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4448499736aSShri Abhyankar     idt  = bs*i;
4458499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4468499736aSShri Abhyankar     for(j=0;j<nz;j++){
4478499736aSShri Abhyankar       idx   = bs*vi[j];
4488499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4498499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4508499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4518499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4528499736aSShri Abhyankar       v += bs2;
4538499736aSShri Abhyankar     }
4548499736aSShri Abhyankar   }
4558499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4578499736aSShri Abhyankar   PetscFunctionReturn(0);
4588499736aSShri Abhyankar }
4598499736aSShri Abhyankar 
4608499736aSShri Abhyankar #undef __FUNCT__
46106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
46206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463f1af5d2fSBarry Smith {
464f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
465dfbe8321SBarry Smith   PetscErrorCode    ierr;
466b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
468b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
469b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   PetscFunctionBegin;
472ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   /* forward solve the U^T */
476f1af5d2fSBarry Smith   idx = 0;
477f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
480f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
481ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487f1af5d2fSBarry Smith     v += 25;
488f1af5d2fSBarry Smith 
489f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
490f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
491f1af5d2fSBarry Smith     while (nz--) {
492f1af5d2fSBarry Smith       oidx = 5*(*vi++);
493f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498f1af5d2fSBarry Smith       v  += 25;
499f1af5d2fSBarry Smith     }
500f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501f1af5d2fSBarry Smith     idx += 5;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     idt  = 5*i;
509f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510f1af5d2fSBarry Smith     while (nz--) {
511f1af5d2fSBarry Smith       idx   = 5*(*vi--);
512f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517f1af5d2fSBarry Smith       v -= 25;
518f1af5d2fSBarry Smith     }
519f1af5d2fSBarry Smith   }
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5288499736aSShri Abhyankar {
5298499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5308499736aSShri Abhyankar   PetscErrorCode ierr;
531b3260449SShri Abhyankar   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5328499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
533b3260449SShri Abhyankar   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
534b3260449SShri Abhyankar   const MatScalar      *aa=a->a,*v;
535b3260449SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
5368499736aSShri Abhyankar 
5378499736aSShri Abhyankar   PetscFunctionBegin;
5388499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5398499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5408499736aSShri Abhyankar 
5418499736aSShri Abhyankar   /* forward solve the U^T */
5428499736aSShri Abhyankar   idx = 0;
5438499736aSShri Abhyankar   for (i=0; i<n; i++) {
5448499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5458499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5468499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5478499736aSShri Abhyankar     x5 = x[4+idx];
5488499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5498499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5508499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5518499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5528499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5538499736aSShri Abhyankar     v -= bs2;
5548499736aSShri Abhyankar 
5558499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5568499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5578499736aSShri Abhyankar     for(j=0;j>-nz;j--){
5588499736aSShri Abhyankar       oidx = bs*vi[j];
5598499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5608499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5618499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5628499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5638499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5648499736aSShri Abhyankar       v  -= bs2;
5658499736aSShri Abhyankar     }
5668499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5678499736aSShri Abhyankar     idx += bs;
5688499736aSShri Abhyankar   }
5698499736aSShri Abhyankar   /* backward solve the L^T */
5708499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
5718499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5728499736aSShri Abhyankar     vi   = aj + ai[i];
5738499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
5748499736aSShri Abhyankar     idt  = bs*i;
5758499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
5768499736aSShri Abhyankar     for(j=0;j<nz;j++){
5778499736aSShri Abhyankar       idx   = bs*vi[j];
5788499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5798499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5808499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5818499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5828499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5838499736aSShri Abhyankar       v += bs2;
5848499736aSShri Abhyankar     }
5858499736aSShri Abhyankar   }
5868499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5878499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5888499736aSShri Abhyankar   PetscFunctionReturn(0);
5898499736aSShri Abhyankar }
5908499736aSShri Abhyankar 
5918499736aSShri Abhyankar #undef __FUNCT__
59206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
59306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594f1af5d2fSBarry Smith {
595f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
596dfbe8321SBarry Smith   PetscErrorCode    ierr;
597b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
599b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
600b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   PetscFunctionBegin;
603ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605f1af5d2fSBarry Smith 
606f1af5d2fSBarry Smith   /* forward solve the U^T */
607f1af5d2fSBarry Smith   idx = 0;
608f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
609f1af5d2fSBarry Smith 
610f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
611f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
612ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613ef66eb69SBarry Smith     x6    = x[5+idx];
614f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620f1af5d2fSBarry Smith     v += 36;
621f1af5d2fSBarry Smith 
622f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
623f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       oidx = 6*(*vi++);
626f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v  += 36;
633f1af5d2fSBarry Smith     }
634f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635f1af5d2fSBarry Smith     x[5+idx] = s6;
636f1af5d2fSBarry Smith     idx += 6;
637f1af5d2fSBarry Smith   }
638f1af5d2fSBarry Smith   /* backward solve the L^T */
639f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
640f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
641f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
642f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
643f1af5d2fSBarry Smith     idt  = 6*i;
644f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645f1af5d2fSBarry Smith     s6 = x[5+idt];
646f1af5d2fSBarry Smith     while (nz--) {
647f1af5d2fSBarry Smith       idx   = 6*(*vi--);
648f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654f1af5d2fSBarry Smith       v -= 36;
655f1af5d2fSBarry Smith     }
656f1af5d2fSBarry Smith   }
6571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659f1af5d2fSBarry Smith   PetscFunctionReturn(0);
660f1af5d2fSBarry Smith }
661f1af5d2fSBarry Smith 
6624a2ae208SSatish Balay #undef __FUNCT__
6634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6658499736aSShri Abhyankar {
6668499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
6678499736aSShri Abhyankar   PetscErrorCode    ierr;
668b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6698499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
670b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
671b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
672b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
6738499736aSShri Abhyankar 
6748499736aSShri Abhyankar   PetscFunctionBegin;
6758499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6768499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
6778499736aSShri Abhyankar 
6788499736aSShri Abhyankar   /* forward solve the U^T */
6798499736aSShri Abhyankar   idx = 0;
6808499736aSShri Abhyankar   for (i=0; i<n; i++) {
6818499736aSShri Abhyankar     v     = aa + bs2*diag[i];
6828499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
6838499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
6848499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
6858499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
6868499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
6878499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
6888499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
6898499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
6908499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
6918499736aSShri Abhyankar     v -= bs2;
6928499736aSShri Abhyankar 
6938499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
6948499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
6958499736aSShri Abhyankar     for(j=0;j>-nz;j--){
6968499736aSShri Abhyankar       oidx = bs*vi[j];
6978499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
6988499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
6998499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7008499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7018499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7028499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7038499736aSShri Abhyankar       v  -= bs2;
7048499736aSShri Abhyankar     }
7058499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
7068499736aSShri Abhyankar     x[5+idx] = s6;
7078499736aSShri Abhyankar     idx += bs;
7088499736aSShri Abhyankar   }
7098499736aSShri Abhyankar   /* backward solve the L^T */
7108499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
7118499736aSShri Abhyankar     v    = aa + bs2*ai[i];
7128499736aSShri Abhyankar     vi   = aj + ai[i];
7138499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
7148499736aSShri Abhyankar     idt  = bs*i;
7158499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
7168499736aSShri Abhyankar     s6   = x[5+idt];
7178499736aSShri Abhyankar     for(j=0;j<nz;j++){
7188499736aSShri Abhyankar       idx   = bs*vi[j];
7198499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7208499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7218499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7228499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7238499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7248499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7258499736aSShri Abhyankar       v += bs2;
7268499736aSShri Abhyankar     }
7278499736aSShri Abhyankar   }
7288499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7298499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7308499736aSShri Abhyankar   PetscFunctionReturn(0);
7318499736aSShri Abhyankar }
7328499736aSShri Abhyankar 
7338499736aSShri Abhyankar #undef __FUNCT__
73406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
73506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736f1af5d2fSBarry Smith {
737f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
738dfbe8321SBarry Smith   PetscErrorCode    ierr;
739b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
741b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
742b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743f1af5d2fSBarry Smith 
744f1af5d2fSBarry Smith   PetscFunctionBegin;
745ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7461ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith   /* forward solve the U^T */
749f1af5d2fSBarry Smith   idx = 0;
750f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
751f1af5d2fSBarry Smith 
752f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
753f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
754ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
756f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763f1af5d2fSBarry Smith     v += 49;
764f1af5d2fSBarry Smith 
765f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
766f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
767f1af5d2fSBarry Smith     while (nz--) {
768f1af5d2fSBarry Smith       oidx = 7*(*vi++);
769f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776f1af5d2fSBarry Smith       v  += 49;
777f1af5d2fSBarry Smith     }
778f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
780f1af5d2fSBarry Smith     idx += 7;
781f1af5d2fSBarry Smith   }
782f1af5d2fSBarry Smith   /* backward solve the L^T */
783f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
784f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
785f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
786f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
787f1af5d2fSBarry Smith     idt  = 7*i;
788f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
790f1af5d2fSBarry Smith     while (nz--) {
791f1af5d2fSBarry Smith       idx   = 7*(*vi--);
792f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799f1af5d2fSBarry Smith       v -= 49;
800f1af5d2fSBarry Smith     }
801f1af5d2fSBarry Smith   }
8021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804f1af5d2fSBarry Smith   PetscFunctionReturn(0);
805f1af5d2fSBarry Smith }
8068499736aSShri Abhyankar #undef __FUNCT__
8074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
8084dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
8098499736aSShri Abhyankar {
8108499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
8118499736aSShri Abhyankar   PetscErrorCode    ierr;
812b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
8138499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
814b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
815b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
816b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
8178499736aSShri Abhyankar 
8188499736aSShri Abhyankar   PetscFunctionBegin;
8198499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8208499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8218499736aSShri Abhyankar 
8228499736aSShri Abhyankar   /* forward solve the U^T */
8238499736aSShri Abhyankar   idx = 0;
8248499736aSShri Abhyankar   for (i=0; i<n; i++) {
8258499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8268499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8278499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8288499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8298499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8308499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8318499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8328499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8338499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8348499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8358499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8368499736aSShri Abhyankar     v -= bs2;
8378499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8388499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8398499736aSShri Abhyankar     for(j=0;j>-nz;j--){
8408499736aSShri Abhyankar       oidx = bs*vi[j];
8418499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8428499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8438499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8448499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8458499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8468499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8478499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8488499736aSShri Abhyankar       v  -= bs2;
8498499736aSShri Abhyankar     }
8508499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8518499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8528499736aSShri Abhyankar     idx += bs;
8538499736aSShri Abhyankar   }
8548499736aSShri Abhyankar   /* backward solve the L^T */
8558499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
8568499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8578499736aSShri Abhyankar     vi   = aj + ai[i];
8588499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8598499736aSShri Abhyankar     idt  = bs*i;
8608499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8618499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
8628499736aSShri Abhyankar     for(j=0;j<nz;j++){
8638499736aSShri Abhyankar       idx   = bs*vi[j];
8648499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8658499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8668499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8678499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8688499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8698499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8708499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8718499736aSShri Abhyankar       v += bs2;
8728499736aSShri Abhyankar     }
8738499736aSShri Abhyankar   }
8748499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
8758499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
8768499736aSShri Abhyankar   PetscFunctionReturn(0);
8778499736aSShri Abhyankar }
878f1af5d2fSBarry Smith 
879f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
8804a2ae208SSatish Balay #undef __FUNCT__
88193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
88293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
88393fd935bSShri Abhyankar {
88493fd935bSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
88593fd935bSShri Abhyankar   IS                iscol = a->col,isrow = a->row;
88693fd935bSShri Abhyankar   PetscErrorCode    ierr;
88793fd935bSShri Abhyankar   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
88893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
88993fd935bSShri Abhyankar   PetscInt          nz;
89093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
89193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
89293fd935bSShri Abhyankar   const PetscScalar *b;
89393fd935bSShri Abhyankar 
89493fd935bSShri Abhyankar   PetscFunctionBegin;
8953649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
89693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
89793fd935bSShri Abhyankar   tmp  = a->solve_work;
89893fd935bSShri Abhyankar 
89993fd935bSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
90093fd935bSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
90193fd935bSShri Abhyankar 
90293fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
90393fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[c[i]];
90493fd935bSShri Abhyankar 
90593fd935bSShri Abhyankar   /* forward solve the U^T */
90693fd935bSShri Abhyankar   for (i=0; i<n; i++) {
90793fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
90893fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
90993fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
91093fd935bSShri Abhyankar     s1  = tmp[i];
91193fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
91293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
91393fd935bSShri Abhyankar     tmp[i] = s1;
91493fd935bSShri Abhyankar   }
91593fd935bSShri Abhyankar 
91693fd935bSShri Abhyankar   /* backward solve the L^T */
91793fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
91893fd935bSShri Abhyankar     v   = aa + ai[i];
91993fd935bSShri Abhyankar     vi  = aj + ai[i];
92093fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
92193fd935bSShri Abhyankar     s1  = tmp[i];
92293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
92393fd935bSShri Abhyankar   }
92493fd935bSShri Abhyankar 
92593fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
92693fd935bSShri Abhyankar   for (i=0; i<n; i++) x[r[i]] = tmp[i];
92793fd935bSShri Abhyankar 
92893fd935bSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
92993fd935bSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9303649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
93193fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
93293fd935bSShri Abhyankar 
93393fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
93493fd935bSShri Abhyankar   PetscFunctionReturn(0);
93593fd935bSShri Abhyankar }
93693fd935bSShri Abhyankar 
93793fd935bSShri Abhyankar #undef __FUNCT__
93806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
93906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940f1af5d2fSBarry Smith {
941f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
942f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9436849ba73SBarry Smith   PetscErrorCode    ierr;
9445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
945b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946b3260449SShri Abhyankar   PetscInt          i,nz;
947b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
948b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
949b3260449SShri Abhyankar   const PetscScalar *b;
950f1af5d2fSBarry Smith 
951f1af5d2fSBarry Smith   PetscFunctionBegin;
9523649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
9531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954f1af5d2fSBarry Smith   t  = a->solve_work;
955f1af5d2fSBarry Smith 
956f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958f1af5d2fSBarry Smith 
959f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
960f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
961f1af5d2fSBarry Smith     t[i] = b[c[i]];
962f1af5d2fSBarry Smith   }
963f1af5d2fSBarry Smith 
964f1af5d2fSBarry Smith   /* forward solve the U^T */
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith 
967f1af5d2fSBarry Smith     v     = aa + diag[i];
968f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
969f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
970f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
971f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
974f1af5d2fSBarry Smith     }
975f1af5d2fSBarry Smith     t[i]   = s1;
976f1af5d2fSBarry Smith   }
977f1af5d2fSBarry Smith   /* backward solve the L^T */
978f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
979f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
980f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
981f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
982f1af5d2fSBarry Smith     s1   = t[i];
983f1af5d2fSBarry Smith     while (nz--) {
984f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
985f1af5d2fSBarry Smith     }
986f1af5d2fSBarry Smith   }
987f1af5d2fSBarry Smith 
988f1af5d2fSBarry Smith   /* copy t into x according to permutation */
989f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
990f1af5d2fSBarry Smith     x[r[i]]   = t[i];
991f1af5d2fSBarry Smith   }
992f1af5d2fSBarry Smith 
993f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
994f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9953649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
9961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
997dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   PetscFunctionReturn(0);
999f1af5d2fSBarry Smith }
1000f1af5d2fSBarry Smith 
10014a2ae208SSatish Balay #undef __FUNCT__
100206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
100306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004f1af5d2fSBarry Smith {
1005f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1006f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
10076849ba73SBarry Smith   PetscErrorCode    ierr;
10085d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1009b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1011b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1012b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1013b3260449SShri Abhyankar   const PetscScalar *b;
1014f1af5d2fSBarry Smith 
1015f1af5d2fSBarry Smith   PetscFunctionBegin;
10163649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
10171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018f1af5d2fSBarry Smith   t  = a->solve_work;
1019f1af5d2fSBarry Smith 
1020f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1021f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1022f1af5d2fSBarry Smith 
1023f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1024f1af5d2fSBarry Smith   ii = 0;
1025f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1026f1af5d2fSBarry Smith     ic      = 2*c[i];
1027f1af5d2fSBarry Smith     t[ii]   = b[ic];
1028f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1029f1af5d2fSBarry Smith     ii += 2;
1030f1af5d2fSBarry Smith   }
1031f1af5d2fSBarry Smith 
1032f1af5d2fSBarry Smith   /* forward solve the U^T */
1033f1af5d2fSBarry Smith   idx = 0;
1034f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1035f1af5d2fSBarry Smith 
1036f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
1037f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1038f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
1039f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
1040f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
1041f1af5d2fSBarry Smith     v += 4;
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1044f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1045f1af5d2fSBarry Smith     while (nz--) {
1046f1af5d2fSBarry Smith       oidx = 2*(*vi++);
1047f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1048f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1049f1af5d2fSBarry Smith       v  += 4;
1050f1af5d2fSBarry Smith     }
1051f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1052f1af5d2fSBarry Smith     idx += 2;
1053f1af5d2fSBarry Smith   }
1054f1af5d2fSBarry Smith   /* backward solve the L^T */
1055f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1056f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
1057f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1058f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1059f1af5d2fSBarry Smith     idt  = 2*i;
1060f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1061f1af5d2fSBarry Smith     while (nz--) {
1062f1af5d2fSBarry Smith       idx   = 2*(*vi--);
1063f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1064f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1065f1af5d2fSBarry Smith       v -= 4;
1066f1af5d2fSBarry Smith     }
1067f1af5d2fSBarry Smith   }
1068f1af5d2fSBarry Smith 
1069f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1070f1af5d2fSBarry Smith   ii = 0;
1071f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1072f1af5d2fSBarry Smith     ir      = 2*r[i];
1073f1af5d2fSBarry Smith     x[ir]   = t[ii];
1074f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1075f1af5d2fSBarry Smith     ii += 2;
1076f1af5d2fSBarry Smith   }
1077f1af5d2fSBarry Smith 
1078f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1079f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10803649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
10811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1082dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1083f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1084f1af5d2fSBarry Smith }
1085f1af5d2fSBarry Smith 
10864a2ae208SSatish Balay #undef __FUNCT__
10874dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
10884dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
108932121132SShri Abhyankar {
109032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
109132121132SShri Abhyankar   PetscErrorCode    ierr;
109232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1093b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
109432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
109532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1096b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1097b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1098b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1099b3260449SShri Abhyankar   const PetscScalar *b;
110032121132SShri Abhyankar 
110132121132SShri Abhyankar   PetscFunctionBegin;
11023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
110332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
110432121132SShri Abhyankar   t = a->solve_work;
110532121132SShri Abhyankar 
110632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
110732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
110832121132SShri Abhyankar 
110932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
111032121132SShri Abhyankar   for(i=0;i<n;i++){
111132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
111232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
111332121132SShri Abhyankar   }
111432121132SShri Abhyankar 
111532121132SShri Abhyankar   /* forward solve the U^T */
111632121132SShri Abhyankar   idx = 0;
111732121132SShri Abhyankar   for (i=0; i<n; i++) {
111832121132SShri Abhyankar     v     = aa + bs2*diag[i];
111932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
112032121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
112132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
112232121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
112332121132SShri Abhyankar     v -= bs2;
112432121132SShri Abhyankar 
112532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
112632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
112732121132SShri Abhyankar     for(j=0;j>-nz;j--){
112832121132SShri Abhyankar       oidx = bs*vi[j];
112932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
113032121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
113132121132SShri Abhyankar       v  -= bs2;
113232121132SShri Abhyankar     }
113332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
113432121132SShri Abhyankar     idx += bs;
113532121132SShri Abhyankar   }
113632121132SShri Abhyankar   /* backward solve the L^T */
113732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
113832121132SShri Abhyankar     v    = aa + bs2*ai[i];
113932121132SShri Abhyankar     vi   = aj + ai[i];
114032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
114132121132SShri Abhyankar     idt  = bs*i;
114232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
114332121132SShri Abhyankar     for(j=0;j<nz;j++){
114432121132SShri Abhyankar       idx   = bs*vi[j];
114532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
114632121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
114732121132SShri Abhyankar       v += bs2;
114832121132SShri Abhyankar     }
114932121132SShri Abhyankar   }
115032121132SShri Abhyankar 
115132121132SShri Abhyankar   /* copy t into x according to permutation */
115232121132SShri Abhyankar   for(i=0;i<n;i++){
115332121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
115432121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
115532121132SShri Abhyankar   }
115632121132SShri Abhyankar 
115732121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
115832121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11593649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
116032121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
116132121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
116232121132SShri Abhyankar   PetscFunctionReturn(0);
116332121132SShri Abhyankar }
116432121132SShri Abhyankar 
116532121132SShri Abhyankar #undef __FUNCT__
116606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
116706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168f1af5d2fSBarry Smith {
1169f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1170f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
11716849ba73SBarry Smith   PetscErrorCode    ierr;
11725d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1173b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1175b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1176b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1177b3260449SShri Abhyankar   const PetscScalar *b;
1178f1af5d2fSBarry Smith 
1179f1af5d2fSBarry Smith   PetscFunctionBegin;
11803649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
11811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182f1af5d2fSBarry Smith   t  = a->solve_work;
1183f1af5d2fSBarry Smith 
1184f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1186f1af5d2fSBarry Smith 
1187f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1188f1af5d2fSBarry Smith   ii = 0;
1189f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1190f1af5d2fSBarry Smith     ic      = 3*c[i];
1191f1af5d2fSBarry Smith     t[ii]   = b[ic];
1192f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1193f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1194f1af5d2fSBarry Smith     ii += 3;
1195f1af5d2fSBarry Smith   }
1196f1af5d2fSBarry Smith 
1197f1af5d2fSBarry Smith   /* forward solve the U^T */
1198f1af5d2fSBarry Smith   idx = 0;
1199f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1200f1af5d2fSBarry Smith 
1201f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1202f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1203f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1204f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1205f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1206f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1207f1af5d2fSBarry Smith     v += 9;
1208f1af5d2fSBarry Smith 
1209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1211f1af5d2fSBarry Smith     while (nz--) {
1212f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1213f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1214f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1215f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216f1af5d2fSBarry Smith       v  += 9;
1217f1af5d2fSBarry Smith     }
1218f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1219f1af5d2fSBarry Smith     idx += 3;
1220f1af5d2fSBarry Smith   }
1221f1af5d2fSBarry Smith   /* backward solve the L^T */
1222f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1223f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1224f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1225f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1226f1af5d2fSBarry Smith     idt  = 3*i;
1227f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1228f1af5d2fSBarry Smith     while (nz--) {
1229f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1230f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1231f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1232f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233f1af5d2fSBarry Smith       v -= 9;
1234f1af5d2fSBarry Smith     }
1235f1af5d2fSBarry Smith   }
1236f1af5d2fSBarry Smith 
1237f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1238f1af5d2fSBarry Smith   ii = 0;
1239f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1240f1af5d2fSBarry Smith     ir      = 3*r[i];
1241f1af5d2fSBarry Smith     x[ir]   = t[ii];
1242f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1243f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1244f1af5d2fSBarry Smith     ii += 3;
1245f1af5d2fSBarry Smith   }
1246f1af5d2fSBarry Smith 
1247f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1248f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12493649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
12501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1252f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1253f1af5d2fSBarry Smith }
1254f1af5d2fSBarry Smith 
12554a2ae208SSatish Balay #undef __FUNCT__
12564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
12574dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
125832121132SShri Abhyankar {
125932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
126032121132SShri Abhyankar   PetscErrorCode    ierr;
126132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1262b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
126332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
126432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1265b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1266b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1267b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1268b3260449SShri Abhyankar   const PetscScalar *b;
126932121132SShri Abhyankar 
127032121132SShri Abhyankar   PetscFunctionBegin;
12713649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
127232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
127332121132SShri Abhyankar   t = a->solve_work;
127432121132SShri Abhyankar 
127532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
127632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
127732121132SShri Abhyankar 
127832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
127932121132SShri Abhyankar   for(i=0;i<n;i++){
128032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
128132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
128232121132SShri Abhyankar   }
128332121132SShri Abhyankar 
128432121132SShri Abhyankar   /* forward solve the U^T */
128532121132SShri Abhyankar   idx = 0;
128632121132SShri Abhyankar   for (i=0; i<n; i++) {
128732121132SShri Abhyankar     v     = aa + bs2*diag[i];
128832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
128932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
129032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
129132121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
129232121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
129332121132SShri Abhyankar     v -= bs2;
129432121132SShri Abhyankar 
129532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
129632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
129732121132SShri Abhyankar     for(j=0;j>-nz;j--){
129832121132SShri Abhyankar       oidx = bs*vi[j];
129932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
130032121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
130132121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
130232121132SShri Abhyankar       v  -= bs2;
130332121132SShri Abhyankar     }
130432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
130532121132SShri Abhyankar     idx += bs;
130632121132SShri Abhyankar   }
130732121132SShri Abhyankar   /* backward solve the L^T */
130832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
130932121132SShri Abhyankar     v    = aa + bs2*ai[i];
131032121132SShri Abhyankar     vi   = aj + ai[i];
131132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
131232121132SShri Abhyankar     idt  = bs*i;
131332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
131432121132SShri Abhyankar     for(j=0;j<nz;j++){
131532121132SShri Abhyankar       idx   = bs*vi[j];
131632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
131732121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
131832121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
131932121132SShri Abhyankar       v += bs2;
132032121132SShri Abhyankar     }
132132121132SShri Abhyankar   }
132232121132SShri Abhyankar 
132332121132SShri Abhyankar   /* copy t into x according to permutation */
132432121132SShri Abhyankar   for(i=0;i<n;i++){
132532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
132632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
132732121132SShri Abhyankar   }
132832121132SShri Abhyankar 
132932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
133032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13313649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
133232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
133332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
133432121132SShri Abhyankar   PetscFunctionReturn(0);
133532121132SShri Abhyankar }
133632121132SShri Abhyankar 
133732121132SShri Abhyankar #undef __FUNCT__
133806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
133906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340f1af5d2fSBarry Smith {
1341f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1342f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
13436849ba73SBarry Smith   PetscErrorCode    ierr;
13445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1345b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1347b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1348b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349b3260449SShri Abhyankar   const PetscScalar *b;
1350f1af5d2fSBarry Smith 
1351f1af5d2fSBarry Smith   PetscFunctionBegin;
13523649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
13531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354f1af5d2fSBarry Smith   t  = a->solve_work;
1355f1af5d2fSBarry Smith 
1356f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1358f1af5d2fSBarry Smith 
1359f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1360f1af5d2fSBarry Smith   ii = 0;
1361f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1362f1af5d2fSBarry Smith     ic      = 4*c[i];
1363f1af5d2fSBarry Smith     t[ii]   = b[ic];
1364f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1365f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1366f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1367f1af5d2fSBarry Smith     ii += 4;
1368f1af5d2fSBarry Smith   }
1369f1af5d2fSBarry Smith 
1370f1af5d2fSBarry Smith   /* forward solve the U^T */
1371f1af5d2fSBarry Smith   idx = 0;
1372f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1373f1af5d2fSBarry Smith 
1374f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1375f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1376f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1377f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1378f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1379f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1380f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381f1af5d2fSBarry Smith     v += 16;
1382f1af5d2fSBarry Smith 
1383f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1384f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1385f1af5d2fSBarry Smith     while (nz--) {
1386f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1387f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1388f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1389f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391f1af5d2fSBarry Smith       v  += 16;
1392f1af5d2fSBarry Smith     }
1393f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394f1af5d2fSBarry Smith     idx += 4;
1395f1af5d2fSBarry Smith   }
1396f1af5d2fSBarry Smith   /* backward solve the L^T */
1397f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1398f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1399f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1400f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1401f1af5d2fSBarry Smith     idt  = 4*i;
1402f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403f1af5d2fSBarry Smith     while (nz--) {
1404f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1405f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1406f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1407f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409f1af5d2fSBarry Smith       v -= 16;
1410f1af5d2fSBarry Smith     }
1411f1af5d2fSBarry Smith   }
1412f1af5d2fSBarry Smith 
1413f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1414f1af5d2fSBarry Smith   ii = 0;
1415f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1416f1af5d2fSBarry Smith     ir      = 4*r[i];
1417f1af5d2fSBarry Smith     x[ir]   = t[ii];
1418f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1419f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1420f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1421f1af5d2fSBarry Smith     ii += 4;
1422f1af5d2fSBarry Smith   }
1423f1af5d2fSBarry Smith 
1424f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1425f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14263649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
14271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1428dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1429f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1430f1af5d2fSBarry Smith }
1431f1af5d2fSBarry Smith 
14324a2ae208SSatish Balay #undef __FUNCT__
14334dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
14344dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
143532121132SShri Abhyankar {
143632121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
143732121132SShri Abhyankar   PetscErrorCode    ierr;
143832121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1439b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
144032121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
144132121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1442b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1443b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1444b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445b3260449SShri Abhyankar   const PetscScalar *b;
144632121132SShri Abhyankar 
144732121132SShri Abhyankar   PetscFunctionBegin;
14483649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
144932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
145032121132SShri Abhyankar   t = a->solve_work;
145132121132SShri Abhyankar 
145232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
145332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
145432121132SShri Abhyankar 
145532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
145632121132SShri Abhyankar   for(i=0;i<n;i++){
145732121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
145832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
145932121132SShri Abhyankar   }
146032121132SShri Abhyankar 
146132121132SShri Abhyankar   /* forward solve the U^T */
146232121132SShri Abhyankar   idx = 0;
146332121132SShri Abhyankar   for (i=0; i<n; i++) {
146432121132SShri Abhyankar     v     = aa + bs2*diag[i];
146532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
146632121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
146732121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
146832121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
146932121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
147032121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
147132121132SShri Abhyankar     v -= bs2;
147232121132SShri Abhyankar 
147332121132SShri Abhyankar     vi    = aj + diag[i] - 1;
147432121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
147532121132SShri Abhyankar     for(j=0;j>-nz;j--){
147632121132SShri Abhyankar       oidx = bs*vi[j];
147732121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
147832121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
147932121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
148032121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
148132121132SShri Abhyankar       v  -= bs2;
148232121132SShri Abhyankar     }
148332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
148432121132SShri Abhyankar     idx += bs;
148532121132SShri Abhyankar   }
148632121132SShri Abhyankar   /* backward solve the L^T */
148732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
148832121132SShri Abhyankar     v    = aa + bs2*ai[i];
148932121132SShri Abhyankar     vi   = aj + ai[i];
149032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
149132121132SShri Abhyankar     idt  = bs*i;
149232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
149332121132SShri Abhyankar     for(j=0;j<nz;j++){
149432121132SShri Abhyankar       idx   = bs*vi[j];
149532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
149632121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
149732121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
149832121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
149932121132SShri Abhyankar       v += bs2;
150032121132SShri Abhyankar     }
150132121132SShri Abhyankar   }
150232121132SShri Abhyankar 
150332121132SShri Abhyankar   /* copy t into x according to permutation */
150432121132SShri Abhyankar   for(i=0;i<n;i++){
150532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
150632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
150732121132SShri Abhyankar   }
150832121132SShri Abhyankar 
150932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
151032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
151232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
151332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
151432121132SShri Abhyankar   PetscFunctionReturn(0);
151532121132SShri Abhyankar }
151632121132SShri Abhyankar 
151732121132SShri Abhyankar #undef __FUNCT__
151806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
151906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520f1af5d2fSBarry Smith {
1521f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1522f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
15236849ba73SBarry Smith   PetscErrorCode    ierr;
15245d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1525b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1527b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1528b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529b3260449SShri Abhyankar   const PetscScalar *b;
1530f1af5d2fSBarry Smith 
1531f1af5d2fSBarry Smith   PetscFunctionBegin;
15323649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
15331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534f1af5d2fSBarry Smith   t  = a->solve_work;
1535f1af5d2fSBarry Smith 
1536f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1538f1af5d2fSBarry Smith 
1539f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1540f1af5d2fSBarry Smith   ii = 0;
1541f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1542f1af5d2fSBarry Smith     ic      = 5*c[i];
1543f1af5d2fSBarry Smith     t[ii]   = b[ic];
1544f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1545f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1546f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1547f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1548f1af5d2fSBarry Smith     ii += 5;
1549f1af5d2fSBarry Smith   }
1550f1af5d2fSBarry Smith 
1551f1af5d2fSBarry Smith   /* forward solve the U^T */
1552f1af5d2fSBarry Smith   idx = 0;
1553f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1554f1af5d2fSBarry Smith 
1555f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1556f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1557f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1559f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1560f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563f1af5d2fSBarry Smith     v += 25;
1564f1af5d2fSBarry Smith 
1565f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1566f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1567f1af5d2fSBarry Smith     while (nz--) {
1568f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1569f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1570f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1571f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574f1af5d2fSBarry Smith       v  += 25;
1575f1af5d2fSBarry Smith     }
1576f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577f1af5d2fSBarry Smith     idx += 5;
1578f1af5d2fSBarry Smith   }
1579f1af5d2fSBarry Smith   /* backward solve the L^T */
1580f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1581f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1582f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1583f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1584f1af5d2fSBarry Smith     idt  = 5*i;
1585f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586f1af5d2fSBarry Smith     while (nz--) {
1587f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1588f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1589f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1590f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593f1af5d2fSBarry Smith       v -= 25;
1594f1af5d2fSBarry Smith     }
1595f1af5d2fSBarry Smith   }
1596f1af5d2fSBarry Smith 
1597f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1598f1af5d2fSBarry Smith   ii = 0;
1599f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1600f1af5d2fSBarry Smith     ir      = 5*r[i];
1601f1af5d2fSBarry Smith     x[ir]   = t[ii];
1602f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1603f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1604f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1605f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1606f1af5d2fSBarry Smith     ii += 5;
1607f1af5d2fSBarry Smith   }
1608f1af5d2fSBarry Smith 
1609f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1610f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1613dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1614f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1615f1af5d2fSBarry Smith }
1616f1af5d2fSBarry Smith 
16174a2ae208SSatish Balay #undef __FUNCT__
16184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
16194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
162032121132SShri Abhyankar {
162132121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
162232121132SShri Abhyankar   PetscErrorCode    ierr;
162332121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1624b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
162532121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
162632121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1627b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1628b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1629b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630b3260449SShri Abhyankar   const PetscScalar *b;
163132121132SShri Abhyankar 
163232121132SShri Abhyankar   PetscFunctionBegin;
16333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
163432121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
163532121132SShri Abhyankar   t = a->solve_work;
163632121132SShri Abhyankar 
163732121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
163832121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
163932121132SShri Abhyankar 
164032121132SShri Abhyankar   /* copy b into temp work space according to permutation */
164132121132SShri Abhyankar   for(i=0;i<n;i++){
164232121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
164332121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
164432121132SShri Abhyankar     t[ii+4] = b[ic+4];
164532121132SShri Abhyankar   }
164632121132SShri Abhyankar 
164732121132SShri Abhyankar   /* forward solve the U^T */
164832121132SShri Abhyankar   idx = 0;
164932121132SShri Abhyankar   for (i=0; i<n; i++) {
165032121132SShri Abhyankar     v     = aa + bs2*diag[i];
165132121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
165232121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
165332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
165432121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
165532121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
165632121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
165732121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
165832121132SShri Abhyankar     v -= bs2;
165932121132SShri Abhyankar 
166032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
166132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
166232121132SShri Abhyankar     for(j=0;j>-nz;j--){
166332121132SShri Abhyankar       oidx = bs*vi[j];
166432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
166532121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
166632121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
166732121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
166832121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
166932121132SShri Abhyankar       v  -= bs2;
167032121132SShri Abhyankar     }
167132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
167232121132SShri Abhyankar     idx += bs;
167332121132SShri Abhyankar   }
167432121132SShri Abhyankar   /* backward solve the L^T */
167532121132SShri Abhyankar   for (i=n-1; i>=0; i--){
167632121132SShri Abhyankar     v    = aa + bs2*ai[i];
167732121132SShri Abhyankar     vi   = aj + ai[i];
167832121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
167932121132SShri Abhyankar     idt  = bs*i;
168032121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
168132121132SShri Abhyankar     for(j=0;j<nz;j++){
168232121132SShri Abhyankar       idx   = bs*vi[j];
168332121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
168432121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
168532121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
168632121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
168732121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
168832121132SShri Abhyankar       v += bs2;
168932121132SShri Abhyankar     }
169032121132SShri Abhyankar   }
169132121132SShri Abhyankar 
169232121132SShri Abhyankar   /* copy t into x according to permutation */
169332121132SShri Abhyankar   for(i=0;i<n;i++){
169432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
169532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
169632121132SShri Abhyankar     x[ir+4] = t[ii+4];
169732121132SShri Abhyankar   }
169832121132SShri Abhyankar 
169932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
170032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17013649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
170232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
170432121132SShri Abhyankar   PetscFunctionReturn(0);
170532121132SShri Abhyankar }
170632121132SShri Abhyankar 
170732121132SShri Abhyankar #undef __FUNCT__
170806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
170906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710f1af5d2fSBarry Smith {
1711f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1712f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
17136849ba73SBarry Smith   PetscErrorCode    ierr;
17145d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1715b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1717b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1718b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719b3260449SShri Abhyankar   const PetscScalar *b;
1720f1af5d2fSBarry Smith 
1721f1af5d2fSBarry Smith   PetscFunctionBegin;
17223649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
17231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1724f1af5d2fSBarry Smith   t  = a->solve_work;
1725f1af5d2fSBarry Smith 
1726f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1727f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1728f1af5d2fSBarry Smith 
1729f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1730f1af5d2fSBarry Smith   ii = 0;
1731f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1732f1af5d2fSBarry Smith     ic      = 6*c[i];
1733f1af5d2fSBarry Smith     t[ii]   = b[ic];
1734f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1735f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1736f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1737f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1738f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1739f1af5d2fSBarry Smith     ii += 6;
1740f1af5d2fSBarry Smith   }
1741f1af5d2fSBarry Smith 
1742f1af5d2fSBarry Smith   /* forward solve the U^T */
1743f1af5d2fSBarry Smith   idx = 0;
1744f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1745f1af5d2fSBarry Smith 
1746f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1747f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1748f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749f1af5d2fSBarry Smith     x6    = t[5+idx];
1750f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1751f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1752f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756f1af5d2fSBarry Smith     v += 36;
1757f1af5d2fSBarry Smith 
1758f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1759f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1760f1af5d2fSBarry Smith     while (nz--) {
1761f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1762f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1763f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1764f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768f1af5d2fSBarry Smith       v  += 36;
1769f1af5d2fSBarry Smith     }
1770f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771f1af5d2fSBarry Smith     t[5+idx] = s6;
1772f1af5d2fSBarry Smith     idx += 6;
1773f1af5d2fSBarry Smith   }
1774f1af5d2fSBarry Smith   /* backward solve the L^T */
1775f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1776f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1777f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1778f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1779f1af5d2fSBarry Smith     idt  = 6*i;
1780f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781f1af5d2fSBarry Smith     s6 = t[5+idt];
1782f1af5d2fSBarry Smith     while (nz--) {
1783f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1784f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1785f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1786f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790f1af5d2fSBarry Smith       v -= 36;
1791f1af5d2fSBarry Smith     }
1792f1af5d2fSBarry Smith   }
1793f1af5d2fSBarry Smith 
1794f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1795f1af5d2fSBarry Smith   ii = 0;
1796f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1797f1af5d2fSBarry Smith     ir      = 6*r[i];
1798f1af5d2fSBarry Smith     x[ir]   = t[ii];
1799f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1800f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1801f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1802f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1803f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1804f1af5d2fSBarry Smith     ii += 6;
1805f1af5d2fSBarry Smith   }
1806f1af5d2fSBarry Smith 
1807f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1808f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18093649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
18101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1811dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1812f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1813f1af5d2fSBarry Smith }
1814f1af5d2fSBarry Smith 
18154a2ae208SSatish Balay #undef __FUNCT__
18164dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
18174dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
181832121132SShri Abhyankar {
181932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
182032121132SShri Abhyankar   PetscErrorCode    ierr;
182132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1822b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
182332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
182432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1825b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1826b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1827b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828b3260449SShri Abhyankar   const PetscScalar *b;
182932121132SShri Abhyankar 
183032121132SShri Abhyankar   PetscFunctionBegin;
18313649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
183232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
183332121132SShri Abhyankar   t = a->solve_work;
183432121132SShri Abhyankar 
183532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
183632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
183732121132SShri Abhyankar 
183832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
183932121132SShri Abhyankar   for(i=0;i<n;i++){
184032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
184132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
184232121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
184332121132SShri Abhyankar   }
184432121132SShri Abhyankar 
184532121132SShri Abhyankar   /* forward solve the U^T */
184632121132SShri Abhyankar   idx = 0;
184732121132SShri Abhyankar   for (i=0; i<n; i++) {
184832121132SShri Abhyankar     v     = aa + bs2*diag[i];
184932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
185032121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
185132121132SShri Abhyankar     x6    = t[5+idx];
185232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
185332121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
185432121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
185532121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
185632121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
185732121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
185832121132SShri Abhyankar     v -= bs2;
185932121132SShri Abhyankar 
186032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
186132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
186232121132SShri Abhyankar     for(j=0;j>-nz;j--){
186332121132SShri Abhyankar       oidx = bs*vi[j];
186432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
186532121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
186632121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
186732121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
186832121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
186932121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
187032121132SShri Abhyankar       v  -= bs2;
187132121132SShri Abhyankar     }
187232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
187332121132SShri Abhyankar     t[5+idx] = s6;
187432121132SShri Abhyankar     idx += bs;
187532121132SShri Abhyankar   }
187632121132SShri Abhyankar   /* backward solve the L^T */
187732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
187832121132SShri Abhyankar     v    = aa + bs2*ai[i];
187932121132SShri Abhyankar     vi   = aj + ai[i];
188032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
188132121132SShri Abhyankar     idt  = bs*i;
188232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
188332121132SShri Abhyankar     s6   = t[5+idt];
188432121132SShri Abhyankar    for(j=0;j<nz;j++){
188532121132SShri Abhyankar       idx   = bs*vi[j];
188632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
188732121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
188832121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
188932121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
189032121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
189132121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
189232121132SShri Abhyankar       v += bs2;
189332121132SShri Abhyankar     }
189432121132SShri Abhyankar   }
189532121132SShri Abhyankar 
189632121132SShri Abhyankar   /* copy t into x according to permutation */
189732121132SShri Abhyankar   for(i=0;i<n;i++){
189832121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
189932121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
190032121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
190132121132SShri Abhyankar   }
190232121132SShri Abhyankar 
190332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
190432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19053649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
190632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
190732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
190832121132SShri Abhyankar   PetscFunctionReturn(0);
190932121132SShri Abhyankar }
191032121132SShri Abhyankar 
191132121132SShri Abhyankar #undef __FUNCT__
191206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
191306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914f1af5d2fSBarry Smith {
1915f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1916f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
19176849ba73SBarry Smith   PetscErrorCode    ierr;
19185d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1919b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1921b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1922b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923b3260449SShri Abhyankar   const PetscScalar *b;
1924f1af5d2fSBarry Smith 
1925f1af5d2fSBarry Smith   PetscFunctionBegin;
19263649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1928f1af5d2fSBarry Smith   t  = a->solve_work;
1929f1af5d2fSBarry Smith 
1930f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1931f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1932f1af5d2fSBarry Smith 
1933f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1934f1af5d2fSBarry Smith   ii = 0;
1935f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1936f1af5d2fSBarry Smith     ic      = 7*c[i];
1937f1af5d2fSBarry Smith     t[ii]   = b[ic];
1938f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1939f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1940f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1941f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1942f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1943f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1944f1af5d2fSBarry Smith     ii += 7;
1945f1af5d2fSBarry Smith   }
1946f1af5d2fSBarry Smith 
1947f1af5d2fSBarry Smith   /* forward solve the U^T */
1948f1af5d2fSBarry Smith   idx = 0;
1949f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1950f1af5d2fSBarry Smith 
1951f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1952f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1953f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1955f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1956f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962f1af5d2fSBarry Smith     v += 49;
1963f1af5d2fSBarry Smith 
1964f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1965f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1966f1af5d2fSBarry Smith     while (nz--) {
1967f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1968f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1969f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975f1af5d2fSBarry Smith       v  += 49;
1976f1af5d2fSBarry Smith     }
1977f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1979f1af5d2fSBarry Smith     idx += 7;
1980f1af5d2fSBarry Smith   }
1981f1af5d2fSBarry Smith   /* backward solve the L^T */
1982f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1983f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1984f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1985f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1986f1af5d2fSBarry Smith     idt  = 7*i;
1987f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1989f1af5d2fSBarry Smith     while (nz--) {
1990f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1991f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1992f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998f1af5d2fSBarry Smith       v -= 49;
1999f1af5d2fSBarry Smith     }
2000f1af5d2fSBarry Smith   }
2001f1af5d2fSBarry Smith 
2002f1af5d2fSBarry Smith   /* copy t into x according to permutation */
2003f1af5d2fSBarry Smith   ii = 0;
2004f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
2005f1af5d2fSBarry Smith     ir      = 7*r[i];
2006f1af5d2fSBarry Smith     x[ir]   = t[ii];
2007f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
2008f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
2009f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
2010f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
2011f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
2012f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
2013f1af5d2fSBarry Smith     ii += 7;
2014f1af5d2fSBarry Smith   }
2015f1af5d2fSBarry Smith 
2016f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2017f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
20191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2020dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2021f1af5d2fSBarry Smith   PetscFunctionReturn(0);
2022f1af5d2fSBarry Smith }
202332121132SShri Abhyankar #undef __FUNCT__
20244dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
20254dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
202632121132SShri Abhyankar {
202732121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
202832121132SShri Abhyankar   PetscErrorCode    ierr;
202932121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2030b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
203132121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
203232121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2033b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2034b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2035b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036b3260449SShri Abhyankar   const PetscScalar *b;
203732121132SShri Abhyankar 
203832121132SShri Abhyankar   PetscFunctionBegin;
20393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
204032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
204132121132SShri Abhyankar   t = a->solve_work;
204232121132SShri Abhyankar 
204332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
204432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
204532121132SShri Abhyankar 
204632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
204732121132SShri Abhyankar   for(i=0;i<n;i++){
204832121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
204932121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
205032121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
205132121132SShri Abhyankar   }
205232121132SShri Abhyankar 
205332121132SShri Abhyankar   /* forward solve the U^T */
205432121132SShri Abhyankar   idx = 0;
205532121132SShri Abhyankar   for (i=0; i<n; i++) {
205632121132SShri Abhyankar     v     = aa + bs2*diag[i];
205732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
205832121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
205932121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
206032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
206132121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
206232121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
206332121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
206432121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
206532121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
206632121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
206732121132SShri Abhyankar     v -= bs2;
206832121132SShri Abhyankar 
206932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
207032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
207132121132SShri Abhyankar     for(j=0;j>-nz;j--){
207232121132SShri Abhyankar       oidx = bs*vi[j];
207332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
207432121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
207532121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
207632121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
207732121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
207832121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
207932121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
208032121132SShri Abhyankar       v  -= bs2;
208132121132SShri Abhyankar     }
208232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
208332121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
208432121132SShri Abhyankar     idx += bs;
208532121132SShri Abhyankar   }
208632121132SShri Abhyankar   /* backward solve the L^T */
208732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
208832121132SShri Abhyankar     v    = aa + bs2*ai[i];
208932121132SShri Abhyankar     vi   = aj + ai[i];
209032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
209132121132SShri Abhyankar     idt  = bs*i;
209232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
209332121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
209432121132SShri Abhyankar    for(j=0;j<nz;j++){
209532121132SShri Abhyankar       idx   = bs*vi[j];
209632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
209732121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
209832121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
209932121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
210032121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
210132121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
210232121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
210332121132SShri Abhyankar       v += bs2;
210432121132SShri Abhyankar     }
210532121132SShri Abhyankar   }
210632121132SShri Abhyankar 
210732121132SShri Abhyankar   /* copy t into x according to permutation */
210832121132SShri Abhyankar   for(i=0;i<n;i++){
210932121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
211032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
211132121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
211232121132SShri Abhyankar   }
211332121132SShri Abhyankar 
211432121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
211532121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
211732121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
211832121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
211932121132SShri Abhyankar   PetscFunctionReturn(0);
212032121132SShri Abhyankar }
2121f1af5d2fSBarry Smith 
21224e2b4712SSatish Balay /* ----------------------------------------------------------- */
21234a2ae208SSatish Balay #undef __FUNCT__
212406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
212506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21264e2b4712SSatish Balay {
21274e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21284e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21296849ba73SBarry Smith   PetscErrorCode    ierr;
2130b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2131b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132b3260449SShri Abhyankar   PetscInt          i,nz;
2133b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2134b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2135b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2136b3260449SShri Abhyankar   const PetscScalar *b;
21374e2b4712SSatish Balay 
21384e2b4712SSatish Balay   PetscFunctionBegin;
21393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2141f1af5d2fSBarry Smith   t  = a->solve_work;
21424e2b4712SSatish Balay 
21434e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21444e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21454e2b4712SSatish Balay 
21464e2b4712SSatish Balay   /* forward solve the lower triangular */
214787828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21484e2b4712SSatish Balay   for (i=1; i<n; i++) {
21494e2b4712SSatish Balay     v   = aa + bs2*ai[i];
21504e2b4712SSatish Balay     vi  = aj + ai[i];
21514e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2152f1af5d2fSBarry Smith     s = t + bs*i;
215387828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21544e2b4712SSatish Balay     while (nz--) {
2155f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
21564e2b4712SSatish Balay       v += bs2;
21574e2b4712SSatish Balay     }
21584e2b4712SSatish Balay   }
21594e2b4712SSatish Balay   /* backward solve the upper triangular */
2160d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
21614e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21624e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
21634e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
21644e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
216587828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21664e2b4712SSatish Balay     while (nz--) {
2167f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
21684e2b4712SSatish Balay       v += bs2;
21694e2b4712SSatish Balay     }
2170f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
217187828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21724e2b4712SSatish Balay   }
21734e2b4712SSatish Balay 
21744e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21754e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
21771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2178dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21794e2b4712SSatish Balay   PetscFunctionReturn(0);
21804e2b4712SSatish Balay }
21814e2b4712SSatish Balay 
21825c42ef9dSBarry Smith /* ----------------------------------------------------------- */
21835c42ef9dSBarry Smith #undef __FUNCT__
218406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
218506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21865c42ef9dSBarry Smith {
21875c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21885c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
21895c42ef9dSBarry Smith   PetscErrorCode    ierr;
21905c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191b3260449SShri Abhyankar   PetscInt          i,nz,j;
2192b3260449SShri Abhyankar   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
21935c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
21945c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
21955c42ef9dSBarry Smith   const PetscScalar *b;
21965c42ef9dSBarry Smith   PetscFunctionBegin;
21973649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21985c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21995c42ef9dSBarry Smith   t    = a->solve_work;
22005c42ef9dSBarry Smith 
22015c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22025c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22035c42ef9dSBarry Smith 
22045c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
22055c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22065c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22075c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
22085c42ef9dSBarry Smith     }
22095c42ef9dSBarry Smith   }
22105c42ef9dSBarry Smith 
22115c42ef9dSBarry Smith 
22125c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
22135c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
22145c42ef9dSBarry Smith   for (i=0; i<n; i++){
22155c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22165c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
22175c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
22185c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
22195c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
22205c42ef9dSBarry Smith     while (nz--) {
22215c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22225c42ef9dSBarry Smith       v += bs2;
22235c42ef9dSBarry Smith     }
22245c42ef9dSBarry Smith   }
22255c42ef9dSBarry Smith 
22265c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
22275c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
22285c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
22295c42ef9dSBarry Smith     vi  = aj + ai[i];
22305c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
22315c42ef9dSBarry Smith     while (nz--) {
22325c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22335c42ef9dSBarry Smith       v += bs2;
22345c42ef9dSBarry Smith     }
22355c42ef9dSBarry Smith   }
22365c42ef9dSBarry Smith 
22375c42ef9dSBarry Smith   /* copy t into x according to permutation */
22385c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22395c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22405c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
22415c42ef9dSBarry Smith     }
22425c42ef9dSBarry Smith   }
22435c42ef9dSBarry Smith 
22445c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22455c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22463649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22475c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22485c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22495c42ef9dSBarry Smith   PetscFunctionReturn(0);
22505c42ef9dSBarry Smith }
22515c42ef9dSBarry Smith 
22524a2ae208SSatish Balay #undef __FUNCT__
22534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
22544dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
22558499736aSShri Abhyankar {
22568499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22578499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22588499736aSShri Abhyankar   PetscErrorCode    ierr;
2259b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2260b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2261b3260449SShri Abhyankar   PetscInt          i,j,nz;
2262b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
22638499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
22648499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
22658499736aSShri Abhyankar   const PetscScalar *b;
2266b3260449SShri Abhyankar 
22678499736aSShri Abhyankar   PetscFunctionBegin;
22683649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
22698499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22708499736aSShri Abhyankar   t    = a->solve_work;
22718499736aSShri Abhyankar 
22728499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22738499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22748499736aSShri Abhyankar 
22758499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
22768499736aSShri Abhyankar   for (i=0; i<n; i++) {
22778499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22788499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
22798499736aSShri Abhyankar     }
22808499736aSShri Abhyankar   }
22818499736aSShri Abhyankar 
22828499736aSShri Abhyankar 
22838499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
22848499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
22858499736aSShri Abhyankar   for (i=0; i<n; i++){
22868499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22878499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
22888499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
22898499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
22908499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
22918499736aSShri Abhyankar     for(j=0;j>-nz;j--){
22928499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22938499736aSShri Abhyankar       v -= bs2;
22948499736aSShri Abhyankar     }
22958499736aSShri Abhyankar   }
22968499736aSShri Abhyankar 
22978499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
22988499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
22998499736aSShri Abhyankar     v   = aa + bs2*ai[i];
23008499736aSShri Abhyankar     vi  = aj + ai[i];
23018499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
23028499736aSShri Abhyankar     for(j=0;j<nz;j++){
23038499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23048499736aSShri Abhyankar       v += bs2;
23058499736aSShri Abhyankar     }
23068499736aSShri Abhyankar   }
23078499736aSShri Abhyankar 
23088499736aSShri Abhyankar   /* copy t into x according to permutation */
23098499736aSShri Abhyankar   for (i=0; i<n; i++) {
23108499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23118499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
23128499736aSShri Abhyankar     }
23138499736aSShri Abhyankar   }
23148499736aSShri Abhyankar 
23158499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23168499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23173649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
23188499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23198499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
23208499736aSShri Abhyankar   PetscFunctionReturn(0);
23218499736aSShri Abhyankar }
23228499736aSShri Abhyankar 
2323832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
232429a97285SShri Abhyankar 
23252b0b2ea7SShri Abhyankar #undef __FUNCT__
2326832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2327832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
23282b0b2ea7SShri Abhyankar {
23292b0b2ea7SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
23302b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2331b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23320fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
23330b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
23342b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
23352b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
23360fa040f9SShri Abhyankar   PetscScalar       *x;
23370b68f018SBarry Smith   const PetscScalar *b;
23382b0b2ea7SShri Abhyankar 
23392b0b2ea7SShri Abhyankar   PetscFunctionBegin;
23403649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23412b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23422b0b2ea7SShri Abhyankar 
23432b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
234429a97285SShri Abhyankar   idx    = 0;
23450fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
23460fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
23470fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
23482b0b2ea7SShri Abhyankar 
23492b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
23502b0b2ea7SShri Abhyankar     v     = aa + bs2*ai[i];
23512b0b2ea7SShri Abhyankar     vi    = aj + ai[i];
23522b0b2ea7SShri Abhyankar     nz    = ai[i+1] - ai[i];
23530fa040f9SShri Abhyankar     idt   = bs*i;
23540fa040f9SShri Abhyankar     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
23550fa040f9SShri Abhyankar     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
23560fa040f9SShri Abhyankar     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
23572b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
23582b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23590fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23600fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
23610fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
23622b0b2ea7SShri Abhyankar 
23630b8f6341SShri Abhyankar 
23642b0b2ea7SShri Abhyankar       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
23652b0b2ea7SShri Abhyankar       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
23662b0b2ea7SShri Abhyankar       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
23672b0b2ea7SShri Abhyankar       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
23682b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
23692b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
23702b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
23712b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
23722b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
23732b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
23742b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
23752b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
23762b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
23772b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
23782b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
23792b0b2ea7SShri Abhyankar 
23802b0b2ea7SShri Abhyankar       v += bs2;
23812b0b2ea7SShri Abhyankar     }
23820fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
23830fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
23840fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
23852b0b2ea7SShri Abhyankar 
23862b0b2ea7SShri Abhyankar   }
23872b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
23882b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--){
23892b0b2ea7SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
23902b0b2ea7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
23912b0b2ea7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
23922b0b2ea7SShri Abhyankar     idt  = bs*i;
23930fa040f9SShri Abhyankar     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
23940fa040f9SShri Abhyankar     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
23950fa040f9SShri Abhyankar     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
23962b0b2ea7SShri Abhyankar 
23972b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
23982b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23990fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
24000fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24010fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24022b0b2ea7SShri Abhyankar 
24032b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24042b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24052b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24062b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24072b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24082b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24092b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24102b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24112b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24122b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24132b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24142b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24152b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24162b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24172b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24182b0b2ea7SShri Abhyankar 
24192b0b2ea7SShri Abhyankar       v += bs2;
24202b0b2ea7SShri Abhyankar     }
24212b0b2ea7SShri Abhyankar 
24220fa040f9SShri Abhyankar     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
24230fa040f9SShri Abhyankar     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
24240fa040f9SShri Abhyankar     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
24250fa040f9SShri Abhyankar     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
24260fa040f9SShri Abhyankar     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
24270fa040f9SShri Abhyankar     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
24280fa040f9SShri Abhyankar     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
24290fa040f9SShri Abhyankar     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
24300fa040f9SShri Abhyankar     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
24310fa040f9SShri Abhyankar     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
24320fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
24330fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
24340fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
24350fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
24360fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
24372b0b2ea7SShri Abhyankar 
24382b0b2ea7SShri Abhyankar   }
24392b0b2ea7SShri Abhyankar 
24403649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
24412b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24422b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24432b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
24442b0b2ea7SShri Abhyankar }
24452b0b2ea7SShri Abhyankar 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2447832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2448832cc040SShri Abhyankar 
24498499736aSShri Abhyankar #undef __FUNCT__
2450832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2451832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
24520b8f6341SShri Abhyankar {
24530b8f6341SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
24540b8f6341SShri Abhyankar   PetscErrorCode    ierr;
24550b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
245653ef36baSBarry Smith   PetscInt          i,k,nz,idx,idt,m;
24570b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24580b8f6341SShri Abhyankar   PetscScalar       s[15];
245953ef36baSBarry Smith   PetscScalar       *x,xv;
24600b8f6341SShri Abhyankar   const PetscScalar *b;
24610b8f6341SShri Abhyankar 
24620b8f6341SShri Abhyankar   PetscFunctionBegin;
24633649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
24640b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24650b8f6341SShri Abhyankar 
24660b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2467832cc040SShri Abhyankar   for (i=0; i<n; i++) {
24680b8f6341SShri Abhyankar     v     = aa + bs2*ai[i];
24690b8f6341SShri Abhyankar     vi    = aj + ai[i];
24700b8f6341SShri Abhyankar     nz    = ai[i+1] - ai[i];
24710fa040f9SShri Abhyankar     idt   = bs*i;
2472832cc040SShri Abhyankar     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2473832cc040SShri Abhyankar     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2474832cc040SShri Abhyankar     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
24750b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
24760b8f6341SShri Abhyankar       idx   = bs*vi[m];
24770b8f6341SShri Abhyankar       for(k=0;k<15;k++){
247853ef36baSBarry Smith 	xv        = x[k + idx];
247953ef36baSBarry Smith 	x[idt]    -= v[0]*xv;
248053ef36baSBarry Smith 	x[1+idt]  -= v[1]*xv;
248153ef36baSBarry Smith 	x[2+idt]  -= v[2]*xv;
248253ef36baSBarry Smith         x[3+idt]  -= v[3]*xv;
248353ef36baSBarry Smith 	x[4+idt]  -= v[4]*xv;
248453ef36baSBarry Smith 	x[5+idt]  -= v[5]*xv;
248553ef36baSBarry Smith 	x[6+idt]  -= v[6]*xv;
248653ef36baSBarry Smith         x[7+idt]  -= v[7]*xv;
248753ef36baSBarry Smith 	x[8+idt]  -= v[8]*xv;
248853ef36baSBarry Smith 	x[9+idt]  -= v[9]*xv;
248953ef36baSBarry Smith 	x[10+idt] -= v[10]*xv;
249053ef36baSBarry Smith         x[11+idt] -= v[11]*xv;
249153ef36baSBarry Smith 	x[12+idt] -= v[12]*xv;
249253ef36baSBarry Smith 	x[13+idt] -= v[13]*xv;
249353ef36baSBarry Smith 	x[14+idt] -= v[14]*xv;
24940b8f6341SShri Abhyankar 	v += 15;
24950b8f6341SShri Abhyankar       }
24960b8f6341SShri Abhyankar     }
24970b8f6341SShri Abhyankar   }
24980b8f6341SShri Abhyankar   /* backward solve the upper triangular */
24990b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--){
25000b8f6341SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
25010b8f6341SShri Abhyankar     vi   = aj + adiag[i+1]+1;
25020b8f6341SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
25030b8f6341SShri Abhyankar     idt  = bs*i;
25040fa040f9SShri Abhyankar     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
25050fa040f9SShri Abhyankar     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
25060fa040f9SShri Abhyankar     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
25070b8f6341SShri Abhyankar 
25080b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
25090b8f6341SShri Abhyankar       idx   = bs*vi[m];
25100b8f6341SShri Abhyankar       for(k=0;k<15;k++){
251153ef36baSBarry Smith 	xv = x[k + idx];
251253ef36baSBarry Smith 	s[0]  -= v[0]*xv;
251353ef36baSBarry Smith 	s[1]  -= v[1]*xv;
251453ef36baSBarry Smith 	s[2]  -= v[2]*xv;
251553ef36baSBarry Smith         s[3]  -= v[3]*xv;
251653ef36baSBarry Smith 	s[4]  -= v[4]*xv;
251753ef36baSBarry Smith 	s[5]  -= v[5]*xv;
251853ef36baSBarry Smith 	s[6]  -= v[6]*xv;
251953ef36baSBarry Smith         s[7]  -= v[7]*xv;
252053ef36baSBarry Smith 	s[8]  -= v[8]*xv;
252153ef36baSBarry Smith 	s[9]  -= v[9]*xv;
252253ef36baSBarry Smith 	s[10] -= v[10]*xv;
252353ef36baSBarry Smith         s[11] -= v[11]*xv;
252453ef36baSBarry Smith 	s[12] -= v[12]*xv;
252553ef36baSBarry Smith 	s[13] -= v[13]*xv;
252653ef36baSBarry Smith 	s[14] -= v[14]*xv;
25270b8f6341SShri Abhyankar 	v += 15;
25280b8f6341SShri Abhyankar       }
25290b8f6341SShri Abhyankar     }
25300fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
25310b8f6341SShri Abhyankar     for(k=0;k<15;k++){
25320fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
25330fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
25340fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
25350fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
25360fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
25370fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
25380fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
25390fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
25400fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
25410fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
25420fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
25430fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
25440fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
25450fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
25460fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
25470b8f6341SShri Abhyankar       v += 15;
25480b8f6341SShri Abhyankar     }
25490b8f6341SShri Abhyankar   }
25503649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
25510b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25520b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
25530b8f6341SShri Abhyankar   PetscFunctionReturn(0);
25540b8f6341SShri Abhyankar }
25550b8f6341SShri Abhyankar 
25560b8f6341SShri Abhyankar 
25570b8f6341SShri Abhyankar #undef __FUNCT__
255806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
255906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
25604e2b4712SSatish Balay {
25614e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
25624e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
25636849ba73SBarry Smith   PetscErrorCode    ierr;
2564b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2565b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2566b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2567b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2568b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2569b3260449SShri Abhyankar   const PetscScalar *b;
25704e2b4712SSatish Balay 
25714e2b4712SSatish Balay   PetscFunctionBegin;
25723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2574f1af5d2fSBarry Smith   t  = a->solve_work;
25754e2b4712SSatish Balay 
25764e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25774e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25784e2b4712SSatish Balay 
25794e2b4712SSatish Balay   /* forward solve the lower triangular */
25804e2b4712SSatish Balay   idx    = 7*(*r++);
2581f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2582f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2583f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
25844e2b4712SSatish Balay 
25854e2b4712SSatish Balay   for (i=1; i<n; i++) {
25864e2b4712SSatish Balay     v     = aa + 49*ai[i];
25874e2b4712SSatish Balay     vi    = aj + ai[i];
25884e2b4712SSatish Balay     nz    = diag[i] - ai[i];
25894e2b4712SSatish Balay     idx   = 7*(*r++);
2590f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2591f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
25924e2b4712SSatish Balay     while (nz--) {
25934e2b4712SSatish Balay       idx   = 7*(*vi++);
2594f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2595f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2596f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2597f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2598f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2599f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2600f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2601f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2602f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2603f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26044e2b4712SSatish Balay       v += 49;
26054e2b4712SSatish Balay     }
26064e2b4712SSatish Balay     idx = 7*i;
2607f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2608f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2609f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
26104e2b4712SSatish Balay   }
26114e2b4712SSatish Balay   /* backward solve the upper triangular */
26124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26134e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
26144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26164e2b4712SSatish Balay     idt  = 7*i;
2617f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2618f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2619f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
26204e2b4712SSatish Balay     while (nz--) {
26214e2b4712SSatish Balay       idx   = 7*(*vi++);
2622f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2623f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2624f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2625f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2626f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2627f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2628f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2629f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2630f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2631f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26324e2b4712SSatish Balay       v += 49;
26334e2b4712SSatish Balay     }
26344e2b4712SSatish Balay     idc = 7*(*c--);
26354e2b4712SSatish Balay     v   = aa + 49*diag[i];
2636f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2637f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2638f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2639f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2640f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2641f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2642f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2643f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2644f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2645f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2646f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2647f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2648f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2649f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
26504e2b4712SSatish Balay   }
26514e2b4712SSatish Balay 
26524e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26534e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26543649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2656dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26574e2b4712SSatish Balay   PetscFunctionReturn(0);
26584e2b4712SSatish Balay }
26594e2b4712SSatish Balay 
26608f690400SShri Abhyankar #undef __FUNCT__
26614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
26624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
266335aa4fcfSShri Abhyankar {
266435aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
266535aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
266635aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2667b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2668b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2669b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2670b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2671b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2672b3260449SShri Abhyankar   const PetscScalar *b;
267335aa4fcfSShri Abhyankar 
267435aa4fcfSShri Abhyankar   PetscFunctionBegin;
26753649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
267635aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
267735aa4fcfSShri Abhyankar   t  = a->solve_work;
267835aa4fcfSShri Abhyankar 
267935aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
268035aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
268135aa4fcfSShri Abhyankar 
268235aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
268335aa4fcfSShri Abhyankar   idx    = 7*r[0];
268435aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
268535aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
268635aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
268735aa4fcfSShri Abhyankar 
268835aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
268935aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
269035aa4fcfSShri Abhyankar     vi    = aj + ai[i];
269135aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
269235aa4fcfSShri Abhyankar     idx   = 7*r[i];
269335aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
269435aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
269535aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
269635aa4fcfSShri Abhyankar       idx   = 7*vi[m];
269735aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
269835aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
269935aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
270035aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
270135aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
270235aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
270335aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
270435aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
270535aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
270635aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
270735aa4fcfSShri Abhyankar       v += 49;
270835aa4fcfSShri Abhyankar     }
270935aa4fcfSShri Abhyankar     idx = 7*i;
271035aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
271135aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
271235aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
271335aa4fcfSShri Abhyankar   }
271435aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
271535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
271635aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
271735aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
271835aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
271935aa4fcfSShri Abhyankar     idt  = 7*i;
272035aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
272135aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
272235aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
272335aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
272435aa4fcfSShri Abhyankar       idx   = 7*vi[m];
272535aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
272635aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
272735aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
272835aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
272935aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
273035aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
273135aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
273235aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
273335aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
273435aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
273535aa4fcfSShri Abhyankar       v += 49;
273635aa4fcfSShri Abhyankar     }
273735aa4fcfSShri Abhyankar     idc = 7*c[i];
273835aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
273935aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
274035aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
274135aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
274235aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
274335aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
274435aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
274535aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
274635aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
274735aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
274835aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
274935aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
275035aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
275135aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
275235aa4fcfSShri Abhyankar   }
275335aa4fcfSShri Abhyankar 
275435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
275535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
275735aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
275835aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
275935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
276035aa4fcfSShri Abhyankar }
276135aa4fcfSShri Abhyankar 
276235aa4fcfSShri Abhyankar #undef __FUNCT__
276306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
276406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
276515091d37SBarry Smith {
276615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2767b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2768dfbe8321SBarry Smith   PetscErrorCode    ierr;
2769b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2770d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2771d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2772d9fead3dSBarry Smith   const PetscScalar *b;
277315091d37SBarry Smith 
277415091d37SBarry Smith   PetscFunctionBegin;
27753649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
27761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277715091d37SBarry Smith   /* forward solve the lower triangular */
277815091d37SBarry Smith   idx    = 0;
277915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
278015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
278115091d37SBarry Smith   x[6] = b[6+idx];
278215091d37SBarry Smith   for (i=1; i<n; i++) {
278315091d37SBarry Smith     v     =  aa + 49*ai[i];
278415091d37SBarry Smith     vi    =  aj + ai[i];
278515091d37SBarry Smith     nz    =  diag[i] - ai[i];
278615091d37SBarry Smith     idx   =  7*i;
2787f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2788f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2789f1af5d2fSBarry Smith     s7  =  b[6+idx];
279015091d37SBarry Smith     while (nz--) {
279115091d37SBarry Smith       jdx   = 7*(*vi++);
279215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
279315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
279415091d37SBarry Smith       x7    = x[6+jdx];
2795f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2796f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2797f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2798f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2799f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2800f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2801f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
280215091d37SBarry Smith       v += 49;
280315091d37SBarry Smith      }
2804f1af5d2fSBarry Smith     x[idx]   = s1;
2805f1af5d2fSBarry Smith     x[1+idx] = s2;
2806f1af5d2fSBarry Smith     x[2+idx] = s3;
2807f1af5d2fSBarry Smith     x[3+idx] = s4;
2808f1af5d2fSBarry Smith     x[4+idx] = s5;
2809f1af5d2fSBarry Smith     x[5+idx] = s6;
2810f1af5d2fSBarry Smith     x[6+idx] = s7;
281115091d37SBarry Smith   }
281215091d37SBarry Smith   /* backward solve the upper triangular */
281315091d37SBarry Smith   for (i=n-1; i>=0; i--){
281415091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
281515091d37SBarry Smith     vi   = aj + diag[i] + 1;
281615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
281715091d37SBarry Smith     idt  = 7*i;
2818f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2819f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2820f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2821f1af5d2fSBarry Smith     s7 = x[6+idt];
282215091d37SBarry Smith     while (nz--) {
282315091d37SBarry Smith       idx   = 7*(*vi++);
282415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
282515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
282615091d37SBarry Smith       x7    = x[6+idx];
2827f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2828f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2829f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2830f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2831f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2832f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2833f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
283415091d37SBarry Smith       v += 49;
283515091d37SBarry Smith     }
283615091d37SBarry Smith     v        = aa + 49*diag[i];
2837f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2838f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2839f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2840f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2841f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2842f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2843f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2844f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2845f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2846f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2847f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2848f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2849f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2850f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
285115091d37SBarry Smith   }
285215091d37SBarry Smith 
28533649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
28541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2855dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
285615091d37SBarry Smith   PetscFunctionReturn(0);
285715091d37SBarry Smith }
285815091d37SBarry Smith 
2859cee9d6f2SShri Abhyankar #undef __FUNCT__
28604dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
28614dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
286253cca76cSShri Abhyankar {
286353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2864b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
286553cca76cSShri Abhyankar     PetscErrorCode    ierr;
2866b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
2867b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
286853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
286953cca76cSShri Abhyankar     PetscScalar       *x;
287053cca76cSShri Abhyankar     const PetscScalar *b;
287153cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
287253cca76cSShri Abhyankar 
287353cca76cSShri Abhyankar     PetscFunctionBegin;
28743649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
287553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
287653cca76cSShri Abhyankar     /* forward solve the lower triangular */
287753cca76cSShri Abhyankar     idx    = 0;
287853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
287953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
288053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
288153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
288253cca76cSShri Abhyankar        vi   = aj + ai[i];
288353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
288453cca76cSShri Abhyankar       idx   = bs*i;
288553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
288653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
288753cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
288853cca76cSShri Abhyankar           jdx   = bs*vi[k];
288953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
289053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
289153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
289253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
289353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
289453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
289553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
289653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
289753cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
289853cca76cSShri Abhyankar           v   +=  bs2;
289953cca76cSShri Abhyankar         }
290053cca76cSShri Abhyankar 
290153cca76cSShri Abhyankar        x[idx]   = s1;
290253cca76cSShri Abhyankar        x[1+idx] = s2;
290353cca76cSShri Abhyankar        x[2+idx] = s3;
290453cca76cSShri Abhyankar        x[3+idx] = s4;
290553cca76cSShri Abhyankar        x[4+idx] = s5;
290653cca76cSShri Abhyankar        x[5+idx] = s6;
290753cca76cSShri Abhyankar        x[6+idx] = s7;
290853cca76cSShri Abhyankar     }
290953cca76cSShri Abhyankar 
291053cca76cSShri Abhyankar    /* backward solve the upper triangular */
291153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
291253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
291353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
291453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
291553cca76cSShri Abhyankar      idt = bs*i;
291653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
291753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
291853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
291953cca76cSShri Abhyankar       idx   = bs*vi[k];
292053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
292153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
292253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
292353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
292453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
292553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
292653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
292753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
292853cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
292953cca76cSShri Abhyankar         v   +=  bs2;
293053cca76cSShri Abhyankar     }
293153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
293253cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
293353cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
293453cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
293553cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
293653cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
293753cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
293853cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
293953cca76cSShri Abhyankar   }
294053cca76cSShri Abhyankar 
29413649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
294253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
294353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
294453cca76cSShri Abhyankar   PetscFunctionReturn(0);
294553cca76cSShri Abhyankar }
294653cca76cSShri Abhyankar 
294753cca76cSShri Abhyankar #undef __FUNCT__
294806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
294906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
295015091d37SBarry Smith {
295115091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
295215091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
29536849ba73SBarry Smith   PetscErrorCode    ierr;
29545d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2955b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2956b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2957d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2958d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2959d9fead3dSBarry Smith   const PetscScalar *b;
2960b3260449SShri Abhyankar 
296115091d37SBarry Smith   PetscFunctionBegin;
29623649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
29631ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2964f1af5d2fSBarry Smith   t  = a->solve_work;
296515091d37SBarry Smith 
296615091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
296715091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
296815091d37SBarry Smith 
296915091d37SBarry Smith   /* forward solve the lower triangular */
297015091d37SBarry Smith   idx    = 6*(*r++);
2971f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2972f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2973f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
297415091d37SBarry Smith   for (i=1; i<n; i++) {
297515091d37SBarry Smith     v     = aa + 36*ai[i];
297615091d37SBarry Smith     vi    = aj + ai[i];
297715091d37SBarry Smith     nz    = diag[i] - ai[i];
297815091d37SBarry Smith     idx   = 6*(*r++);
2979f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2980f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
298115091d37SBarry Smith     while (nz--) {
298215091d37SBarry Smith       idx   = 6*(*vi++);
2983f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2984f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2985f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2986f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2987f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2988f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2989f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2990f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
299115091d37SBarry Smith       v += 36;
299215091d37SBarry Smith     }
299315091d37SBarry Smith     idx = 6*i;
2994f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2995f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2996f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
299715091d37SBarry Smith   }
299815091d37SBarry Smith   /* backward solve the upper triangular */
299915091d37SBarry Smith   for (i=n-1; i>=0; i--){
300015091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
300115091d37SBarry Smith     vi   = aj + diag[i] + 1;
300215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
300315091d37SBarry Smith     idt  = 6*i;
3004f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3005f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
3006f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
300715091d37SBarry Smith     while (nz--) {
300815091d37SBarry Smith       idx   = 6*(*vi++);
3009f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3010f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3011f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
3012f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3013f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3014f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3015f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3016f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3017f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
301815091d37SBarry Smith       v += 36;
301915091d37SBarry Smith     }
302015091d37SBarry Smith     idc = 6*(*c--);
302115091d37SBarry Smith     v   = aa + 36*diag[i];
3022f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3023f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
3024f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3025f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
3026f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3027f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
3028f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3029f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
3030f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3031f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
3032f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3033f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
303415091d37SBarry Smith   }
303515091d37SBarry Smith 
303615091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
303715091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30383649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
30391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3040dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
304115091d37SBarry Smith   PetscFunctionReturn(0);
304215091d37SBarry Smith }
304315091d37SBarry Smith 
30446506fda5SShri Abhyankar #undef __FUNCT__
30454dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
30464dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
30476506fda5SShri Abhyankar {
30486506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30496506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
30506506fda5SShri Abhyankar   PetscErrorCode    ierr;
30516506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3052b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3053b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
30546506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
30556506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
30566506fda5SShri Abhyankar   const PetscScalar *b;
3057b3260449SShri Abhyankar 
30586506fda5SShri Abhyankar   PetscFunctionBegin;
30593649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30606506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
30616506fda5SShri Abhyankar   t  = a->solve_work;
30626506fda5SShri Abhyankar 
30636506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30646506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
30656506fda5SShri Abhyankar 
30666506fda5SShri Abhyankar   /* forward solve the lower triangular */
30676506fda5SShri Abhyankar   idx    = 6*r[0];
30686506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
30696506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
30706506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
30716506fda5SShri Abhyankar   for (i=1; i<n; i++) {
30726506fda5SShri Abhyankar     v     = aa + 36*ai[i];
30736506fda5SShri Abhyankar     vi    = aj + ai[i];
30746506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
30756506fda5SShri Abhyankar     idx   = 6*r[i];
30766506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30776506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
30786506fda5SShri Abhyankar     for(m=0;m<nz;m++){
30796506fda5SShri Abhyankar       idx   = 6*vi[m];
30806506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
30816506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
30826506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30836506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30846506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30856506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
30866506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
30876506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30886506fda5SShri Abhyankar       v += 36;
30896506fda5SShri Abhyankar     }
30906506fda5SShri Abhyankar     idx = 6*i;
30916506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
30926506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
30936506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
30946506fda5SShri Abhyankar   }
30956506fda5SShri Abhyankar   /* backward solve the upper triangular */
30966506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
30976506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
30986506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
30996506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
31006506fda5SShri Abhyankar     idt  = 6*i;
31016506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
31026506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
31036506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
31046506fda5SShri Abhyankar     for(m=0;m<nz;m++){
31056506fda5SShri Abhyankar       idx   = 6*vi[m];
31066506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
31076506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
31086506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
31096506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31106506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31116506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31126506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31136506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31146506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31156506fda5SShri Abhyankar       v += 36;
31166506fda5SShri Abhyankar     }
31176506fda5SShri Abhyankar     idc = 6*c[i];
31186506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
31196506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
31206506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
31216506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
31226506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
31236506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
31246506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
31256506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
31266506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
31276506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
31286506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
31296506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
31306506fda5SShri Abhyankar   }
31316506fda5SShri Abhyankar 
31326506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31336506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31343649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
31356506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
31366506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
31376506fda5SShri Abhyankar   PetscFunctionReturn(0);
31386506fda5SShri Abhyankar }
31398f690400SShri Abhyankar 
31408f690400SShri Abhyankar #undef __FUNCT__
314106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
314206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
314315091d37SBarry Smith {
314415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3145b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3146dfbe8321SBarry Smith   PetscErrorCode    ierr;
3147b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3148d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3149d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3150d9fead3dSBarry Smith   const PetscScalar *b;
315115091d37SBarry Smith 
315215091d37SBarry Smith   PetscFunctionBegin;
31533649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31541ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315515091d37SBarry Smith   /* forward solve the lower triangular */
315615091d37SBarry Smith   idx    = 0;
315715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
315815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
315915091d37SBarry Smith   for (i=1; i<n; i++) {
316015091d37SBarry Smith     v     =  aa + 36*ai[i];
316115091d37SBarry Smith     vi    =  aj + ai[i];
316215091d37SBarry Smith     nz    =  diag[i] - ai[i];
316315091d37SBarry Smith     idx   =  6*i;
3164f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3165f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
316615091d37SBarry Smith     while (nz--) {
316715091d37SBarry Smith       jdx   = 6*(*vi++);
316815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
316915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3170f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3171f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3172f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3173f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3174f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3175f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
317615091d37SBarry Smith       v += 36;
317715091d37SBarry Smith      }
3178f1af5d2fSBarry Smith     x[idx]   = s1;
3179f1af5d2fSBarry Smith     x[1+idx] = s2;
3180f1af5d2fSBarry Smith     x[2+idx] = s3;
3181f1af5d2fSBarry Smith     x[3+idx] = s4;
3182f1af5d2fSBarry Smith     x[4+idx] = s5;
3183f1af5d2fSBarry Smith     x[5+idx] = s6;
318415091d37SBarry Smith   }
318515091d37SBarry Smith   /* backward solve the upper triangular */
318615091d37SBarry Smith   for (i=n-1; i>=0; i--){
318715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
318815091d37SBarry Smith     vi   = aj + diag[i] + 1;
318915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
319015091d37SBarry Smith     idt  = 6*i;
3191f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
3192f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
3193f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
319415091d37SBarry Smith     while (nz--) {
319515091d37SBarry Smith       idx   = 6*(*vi++);
319615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
319715091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3198f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3199f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3200f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3201f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3202f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3203f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
320415091d37SBarry Smith       v += 36;
320515091d37SBarry Smith     }
320615091d37SBarry Smith     v        = aa + 36*diag[i];
3207f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3208f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3209f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3210f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3211f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3212f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
321315091d37SBarry Smith   }
321415091d37SBarry Smith 
32153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
32161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3217dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
321815091d37SBarry Smith   PetscFunctionReturn(0);
321915091d37SBarry Smith }
322015091d37SBarry Smith 
3221cee9d6f2SShri Abhyankar #undef __FUNCT__
32224dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
32234dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
322453cca76cSShri Abhyankar {
322553cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3226b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
322753cca76cSShri Abhyankar     PetscErrorCode    ierr;
3228b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
3229b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
323053cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
323153cca76cSShri Abhyankar     PetscScalar       *x;
323253cca76cSShri Abhyankar     const PetscScalar *b;
323353cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
323453cca76cSShri Abhyankar 
323553cca76cSShri Abhyankar     PetscFunctionBegin;
32363649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
323753cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
323853cca76cSShri Abhyankar     /* forward solve the lower triangular */
323953cca76cSShri Abhyankar     idx    = 0;
324053cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
324153cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
324253cca76cSShri Abhyankar     for (i=1; i<n; i++) {
324353cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
324453cca76cSShri Abhyankar        vi   = aj + ai[i];
324553cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
324653cca76cSShri Abhyankar       idx   = bs*i;
324753cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
324853cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
324953cca76cSShri Abhyankar        for(k=0;k<nz;k++){
325053cca76cSShri Abhyankar           jdx   = bs*vi[k];
325153cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
325253cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
325353cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
325453cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
325553cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
325653cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
325753cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
325853cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
325953cca76cSShri Abhyankar           v   +=  bs2;
326053cca76cSShri Abhyankar         }
326153cca76cSShri Abhyankar 
326253cca76cSShri Abhyankar        x[idx]   = s1;
326353cca76cSShri Abhyankar        x[1+idx] = s2;
326453cca76cSShri Abhyankar        x[2+idx] = s3;
326553cca76cSShri Abhyankar        x[3+idx] = s4;
326653cca76cSShri Abhyankar        x[4+idx] = s5;
326753cca76cSShri Abhyankar        x[5+idx] = s6;
326853cca76cSShri Abhyankar     }
326953cca76cSShri Abhyankar 
327053cca76cSShri Abhyankar    /* backward solve the upper triangular */
327153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
327253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
327353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
327453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
327553cca76cSShri Abhyankar      idt = bs*i;
327653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
327753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
327853cca76cSShri Abhyankar      for(k=0;k<nz;k++){
327953cca76cSShri Abhyankar       idx   = bs*vi[k];
328053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
328153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
328253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
328353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
328453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
328553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
328653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
328753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
328853cca76cSShri Abhyankar         v   +=  bs2;
328953cca76cSShri Abhyankar     }
329053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
329153cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
329253cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
329353cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
329453cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
329553cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
329653cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
329753cca76cSShri Abhyankar   }
329853cca76cSShri Abhyankar 
32993649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
330053cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
330153cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
330253cca76cSShri Abhyankar   PetscFunctionReturn(0);
330353cca76cSShri Abhyankar }
330453cca76cSShri Abhyankar 
330553cca76cSShri Abhyankar #undef __FUNCT__
330606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
330706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
33084e2b4712SSatish Balay {
33094e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
33104e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33116849ba73SBarry Smith   PetscErrorCode    ierr;
33125d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3313b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3314b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3315d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3316d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3317d9fead3dSBarry Smith   const PetscScalar *b;
33184e2b4712SSatish Balay 
33194e2b4712SSatish Balay   PetscFunctionBegin;
33203649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
33211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3322f1af5d2fSBarry Smith   t  = a->solve_work;
33234e2b4712SSatish Balay 
33244e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33254e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33264e2b4712SSatish Balay 
33274e2b4712SSatish Balay   /* forward solve the lower triangular */
33284e2b4712SSatish Balay   idx    = 5*(*r++);
3329f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3330f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
33314e2b4712SSatish Balay   for (i=1; i<n; i++) {
33324e2b4712SSatish Balay     v     = aa + 25*ai[i];
33334e2b4712SSatish Balay     vi    = aj + ai[i];
33344e2b4712SSatish Balay     nz    = diag[i] - ai[i];
33354e2b4712SSatish Balay     idx   = 5*(*r++);
3336f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3337f1af5d2fSBarry Smith     s5  = b[4+idx];
33384e2b4712SSatish Balay     while (nz--) {
33394e2b4712SSatish Balay       idx   = 5*(*vi++);
3340f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3341f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3342f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3343f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3344f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3345f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3346f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33474e2b4712SSatish Balay       v += 25;
33484e2b4712SSatish Balay     }
33494e2b4712SSatish Balay     idx = 5*i;
3350f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3351f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
33524e2b4712SSatish Balay   }
33534e2b4712SSatish Balay   /* backward solve the upper triangular */
33544e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33554e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
33564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
33584e2b4712SSatish Balay     idt  = 5*i;
3359f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3360f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
33614e2b4712SSatish Balay     while (nz--) {
33624e2b4712SSatish Balay       idx   = 5*(*vi++);
3363f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3364f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3365f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3366f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3367f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3368f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3369f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33704e2b4712SSatish Balay       v += 25;
33714e2b4712SSatish Balay     }
33724e2b4712SSatish Balay     idc = 5*(*c--);
33734e2b4712SSatish Balay     v   = aa + 25*diag[i];
3374f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3375f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3376f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3377f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3378f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3379f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3380f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3381f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3382f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3383f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
33844e2b4712SSatish Balay   }
33854e2b4712SSatish Balay 
33864e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
33874e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
33883649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
33891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3390dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
33914e2b4712SSatish Balay   PetscFunctionReturn(0);
33924e2b4712SSatish Balay }
33934e2b4712SSatish Balay 
339478bb4007SShri Abhyankar #undef __FUNCT__
33954dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
33964dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
339778bb4007SShri Abhyankar {
339878bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
339978bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
340078bb4007SShri Abhyankar   PetscErrorCode    ierr;
340178bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3402b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3403b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
340478bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
340578bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
340678bb4007SShri Abhyankar   const PetscScalar *b;
340778bb4007SShri Abhyankar 
340878bb4007SShri Abhyankar   PetscFunctionBegin;
34093649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
341078bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
341178bb4007SShri Abhyankar   t  = a->solve_work;
341278bb4007SShri Abhyankar 
341378bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
341478bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
341578bb4007SShri Abhyankar 
341678bb4007SShri Abhyankar   /* forward solve the lower triangular */
341778bb4007SShri Abhyankar   idx    = 5*r[0];
341878bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
341978bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
342078bb4007SShri Abhyankar   for (i=1; i<n; i++) {
342178bb4007SShri Abhyankar     v     = aa + 25*ai[i];
342278bb4007SShri Abhyankar     vi    = aj + ai[i];
342378bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
342478bb4007SShri Abhyankar     idx   = 5*r[i];
342578bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
342678bb4007SShri Abhyankar     s5  = b[4+idx];
342778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
342878bb4007SShri Abhyankar       idx   = 5*vi[m];
342978bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
343078bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
343178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
343278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
343378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
343478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
343578bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
343678bb4007SShri Abhyankar       v += 25;
343778bb4007SShri Abhyankar     }
343878bb4007SShri Abhyankar     idx = 5*i;
343978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
344078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
344178bb4007SShri Abhyankar   }
344278bb4007SShri Abhyankar   /* backward solve the upper triangular */
344378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
344478bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
344578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
344678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
344778bb4007SShri Abhyankar     idt  = 5*i;
344878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
344978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
345078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
345178bb4007SShri Abhyankar       idx   = 5*vi[m];
345278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
345378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
345478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
345578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
345678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
345778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
345878bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
345978bb4007SShri Abhyankar       v += 25;
346078bb4007SShri Abhyankar     }
346178bb4007SShri Abhyankar     idc = 5*c[i];
346278bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
346378bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
346478bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
346578bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
346678bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
346778bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
346878bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
346978bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
347078bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
347178bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
347278bb4007SShri Abhyankar   }
347378bb4007SShri Abhyankar 
347478bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
347578bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
347778bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
347878bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
347978bb4007SShri Abhyankar   PetscFunctionReturn(0);
348078bb4007SShri Abhyankar }
348178bb4007SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_inplace - Forward/backward block triangular solve for a
   factored SeqBAIJ matrix with 5x5 blocks, applying no row or column permutation.

   Input Parameters:
     A  - the matrix; its Mat_SeqBAIJ data supplies the block-row offsets (a->i), the block
          column indices (a->j), the per-row diagonal block positions (a->diag), the block
          values (a->a) and the number of block rows (a->mbs)
     bb - right-hand side, accessed read-only

   Output Parameter:
     xx - solution; it also stores the intermediate result of the forward sweep

   Notes:
     Each block occupies 25 consecutive values and is addressed as v[row + 5*col].
     In block row i the forward sweep uses the blocks at positions ai[i] .. diag[i]-1, the
     backward sweep uses the blocks at positions diag[i]+1 .. ai[i+1]-1, and the block at
     position diag[i] is multiplied into the row result at the end of the backward sweep.

     NOTE(review): because the block at diag[i] is multiplied in rather than solved against,
     it presumably already holds the inverse of the diagonal block - confirm against the
     matching factorization routine.
     NOTE(review): x[0..4] is written before n is tested, so this assumes a->mbs >= 1 -
     confirm callers never reach here with an empty matrix.
*/
34828f690400SShri Abhyankar #undef __FUNCT__
348306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
348406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
348515091d37SBarry Smith {
348615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3487b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3488b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3489dfbe8321SBarry Smith   PetscErrorCode    ierr;
3490d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3491d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3492d9fead3dSBarry Smith   const PetscScalar *b;
349315091d37SBarry Smith 
349415091d37SBarry Smith   PetscFunctionBegin;
34953649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
34961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
349715091d37SBarry Smith   /* forward solve the lower triangular */
  /* block row 0 uses no off-diagonal blocks here: its 5 entries are copied straight from b */
349815091d37SBarry Smith   idx    = 0;
349915091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
350015091d37SBarry Smith   for (i=1; i<n; i++) {
    /* v and vi walk the values / block-column indices of row i from ai[i] up to (not including) diag[i] */
350115091d37SBarry Smith     v     =  aa + 25*ai[i];
350215091d37SBarry Smith     vi    =  aj + ai[i];
350315091d37SBarry Smith     nz    =  diag[i] - ai[i];
350415091d37SBarry Smith     idx   =  5*i;
3505f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
350615091d37SBarry Smith     while (nz--) {
      /* s -= (5x5 block) * (entries of x for block column *vi) */
350715091d37SBarry Smith       jdx   = 5*(*vi++);
350815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3509f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3510f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3511f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3512f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3513f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
351415091d37SBarry Smith       v    += 25;
351515091d37SBarry Smith     }
3516f1af5d2fSBarry Smith     x[idx]   = s1;
3517f1af5d2fSBarry Smith     x[1+idx] = s2;
3518f1af5d2fSBarry Smith     x[2+idx] = s3;
3519f1af5d2fSBarry Smith     x[3+idx] = s4;
3520f1af5d2fSBarry Smith     x[4+idx] = s5;
352115091d37SBarry Smith   }
352215091d37SBarry Smith   /* backward solve the upper triangular */
352315091d37SBarry Smith   for (i=n-1; i>=0; i--){
    /* start one block past diag[i] and visit the remaining blocks of row i, up to ai[i+1]-1 */
352415091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
352515091d37SBarry Smith     vi   = aj + diag[i] + 1;
352615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
352715091d37SBarry Smith     idt  = 5*i;
3528f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3529f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
353015091d37SBarry Smith     while (nz--) {
353115091d37SBarry Smith       idx   = 5*(*vi++);
353215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3533f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3534f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3535f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3536f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3537f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
353815091d37SBarry Smith       v    += 25;
353915091d37SBarry Smith     }
    /* multiply the accumulated row result by the block stored at diag[i] and store it in x */
354015091d37SBarry Smith     v        = aa + 25*diag[i];
3541f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3542f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3543f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3544f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3545f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
354615091d37SBarry Smith   }
354715091d37SBarry Smith 
35483649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
35491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count is logged from the stored block count a->nz and the column dimension A->cmap->n */
3550dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
355115091d37SBarry Smith   PetscFunctionReturn(0);
355215091d37SBarry Smith }
355315091d37SBarry Smith 
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Forward/backward block triangular solve for a factored
   SeqBAIJ matrix with 5x5 blocks, applying no row or column permutation.

   Input Parameters:
     A  - the matrix; its Mat_SeqBAIJ data supplies a->i, a->j, a->diag, a->a and a->mbs
     bb - right-hand side, accessed read-only

   Output Parameter:
     xx - solution; it also stores the intermediate result of the forward sweep

   Notes:
     Each block occupies 25 consecutive values and is addressed as v[row + 5*col].
     The forward sweep for block row i uses all blocks at positions ai[i] .. ai[i+1]-1.
     The backward sweep for block row i uses the blocks at positions
     adiag[i+1]+1 .. adiag[i]-1 and then the block at position adiag[i]; adiag[n] is read
     when i = n-1, so a->diag must provide at least n+1 entries.

     NOTE(review): x[0..4] is written before n is tested, so this assumes a->mbs >= 1 -
     confirm callers never reach here with an empty matrix.
*/
3554cee9d6f2SShri Abhyankar #undef __FUNCT__
35554dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
35564dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
355753cca76cSShri Abhyankar {
355853cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3559b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3560b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
356153cca76cSShri Abhyankar   PetscErrorCode    ierr;
356253cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
356353cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
356453cca76cSShri Abhyankar   const PetscScalar *b;
356553cca76cSShri Abhyankar 
356653cca76cSShri Abhyankar   PetscFunctionBegin;
35673649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
356853cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356953cca76cSShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 uses no off-diagonal blocks here: its 5 entries are copied straight from b */
357053cca76cSShri Abhyankar   idx    = 0;
357153cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
357253cca76cSShri Abhyankar   for (i=1; i<n; i++) {
    /* every block stored between ai[i] and ai[i+1] takes part in the forward sweep of row i */
357353cca76cSShri Abhyankar     v   = aa + 25*ai[i];
357453cca76cSShri Abhyankar     vi  = aj + ai[i];
357553cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
357653cca76cSShri Abhyankar     idx = 5*i;
357753cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
357853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
      /* s -= (5x5 block) * (entries of x for block column vi[k]) */
357953cca76cSShri Abhyankar       jdx   = 5*vi[k];
358053cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
358153cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
358253cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
358353cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
358453cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
358553cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
358653cca76cSShri Abhyankar       v    += 25;
358753cca76cSShri Abhyankar     }
358853cca76cSShri Abhyankar     x[idx]   = s1;
358953cca76cSShri Abhyankar     x[1+idx] = s2;
359053cca76cSShri Abhyankar     x[2+idx] = s3;
359153cca76cSShri Abhyankar     x[3+idx] = s4;
359253cca76cSShri Abhyankar     x[4+idx] = s5;
359353cca76cSShri Abhyankar   }
359453cca76cSShri Abhyankar 
359553cca76cSShri Abhyankar   /* backward solve the upper triangular */
359653cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the blocks of row i start just after adiag[i+1]; nz of them precede the block at adiag[i] */
359753cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
359853cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
359953cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
360053cca76cSShri Abhyankar     idt = 5*i;
360153cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
360253cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360353cca76cSShri Abhyankar     for(k=0;k<nz;k++){
360453cca76cSShri Abhyankar       idx   = 5*vi[k];
360553cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
360653cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
360753cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
360853cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
360953cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
361053cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
361153cca76cSShri Abhyankar       v    += 25;
361253cca76cSShri Abhyankar     }
    /* after nz advances v points at the block at position adiag[i], which is multiplied in below */
361353cca76cSShri Abhyankar     /* x = inv_diagonal*x */
361453cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
361553cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
361653cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
361753cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
361853cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
361953cca76cSShri Abhyankar   }
362053cca76cSShri Abhyankar 
36213649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
362253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count is logged from the stored block count a->nz and the column dimension A->cmap->n */
362353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
362453cca76cSShri Abhyankar   PetscFunctionReturn(0);
362553cca76cSShri Abhyankar }
362653cca76cSShri Abhyankar 
/*
   MatSolve_SeqBAIJ_4_inplace - Forward/backward block triangular solve for a factored SeqBAIJ
   matrix with 4x4 blocks, using the row and column index sets stored in the matrix.

   Input Parameters:
     A  - the matrix; its Mat_SeqBAIJ data supplies a->i, a->j, a->diag, a->a, a->mbs, the
          index sets a->row / a->col and the work array a->solve_work
     bb - right-hand side, accessed read-only

   Output Parameter:
     xx - solution

   Notes:
     Each block occupies 16 consecutive values and is addressed as v[row + 4*col].
     The forward sweep reads block r[i] of b (r taken from a->row) and stores block i of the
     work array t. The backward sweep updates block i of t and writes the same values to
     block c[i] of x (c taken from a->col, walked from its last entry down to its first).
     In block row i the forward sweep uses the blocks at positions ai[i] .. diag[i]-1, the
     backward sweep uses the blocks at positions diag[i]+1 .. ai[i+1]-1, and the block at
     position diag[i] is multiplied into the row result.

     NOTE(review): because the block at diag[i] is multiplied in rather than solved against,
     it presumably already holds the inverse of the diagonal block - confirm against the
     matching factorization routine.
     NOTE(review): r[0], b and t[0..3] are accessed before n is tested, so this assumes
     a->mbs >= 1 - confirm callers never reach here with an empty matrix.
*/
362753cca76cSShri Abhyankar #undef __FUNCT__
362806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
362906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
36304e2b4712SSatish Balay {
36314e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
36324e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
36336849ba73SBarry Smith   PetscErrorCode    ierr;
3634b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3635b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
36365d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3637d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3638d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3639d9fead3dSBarry Smith   const PetscScalar *b;
36404e2b4712SSatish Balay 
36414e2b4712SSatish Balay   PetscFunctionBegin;
36423649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
36431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3644f1af5d2fSBarry Smith   t  = a->solve_work;
36454e2b4712SSatish Balay 
  /* r walks the row indices forward from the first entry; c starts at the last column index and walks backward */
36464e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
36474e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
36484e2b4712SSatish Balay 
36494e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0 uses no off-diagonal blocks here: block r[0] of b is copied to block 0 of t */
36504e2b4712SSatish Balay   idx    = 4*(*r++);
3651f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3652f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
36534e2b4712SSatish Balay   for (i=1; i<n; i++) {
36544e2b4712SSatish Balay     v     = aa + 16*ai[i];
36554e2b4712SSatish Balay     vi    = aj + ai[i];
36564e2b4712SSatish Balay     nz    = diag[i] - ai[i];
36574e2b4712SSatish Balay     idx   = 4*(*r++);
3658f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
36594e2b4712SSatish Balay     while (nz--) {
      /* s -= (4x4 block) * (entries of t for block column *vi) */
36604e2b4712SSatish Balay       idx   = 4*(*vi++);
3661f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3662f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3663f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3664f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3665f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
36664e2b4712SSatish Balay       v    += 16;
36674e2b4712SSatish Balay     }
    /* the forward result is stored at block i of t, not at the permuted position */
36684e2b4712SSatish Balay     idx        = 4*i;
3669f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3670f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
36714e2b4712SSatish Balay   }
36724e2b4712SSatish Balay   /* backward solve the upper triangular */
36734e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* start one block past diag[i] and visit the remaining blocks of row i, up to ai[i+1]-1 */
36744e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
36754e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
36764e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
36774e2b4712SSatish Balay     idt  = 4*i;
3678f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3679f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
36804e2b4712SSatish Balay     while (nz--) {
36814e2b4712SSatish Balay       idx   = 4*(*vi++);
3682f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3683f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3684f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3685f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3686f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3687f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
36884e2b4712SSatish Balay       v += 16;
36894e2b4712SSatish Balay     }
    /* multiply by the block at diag[i]; keep the result in t for later rows and scatter it to block c[i] of x */
36904e2b4712SSatish Balay     idc      = 4*(*c--);
36914e2b4712SSatish Balay     v        = aa + 16*diag[i];
3692f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3693f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3694f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3695f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
36964e2b4712SSatish Balay   }
36974e2b4712SSatish Balay 
36984e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
36994e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37003649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
37011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count is logged from the stored block count a->nz and the column dimension A->cmap->n */
3702dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37034e2b4712SSatish Balay   PetscFunctionReturn(0);
37044e2b4712SSatish Balay }
3705f26ec98cSKris Buschelman 
/*
   MatSolve_SeqBAIJ_4 - Forward/backward block triangular solve for a factored SeqBAIJ matrix
   with 4x4 blocks, using the row and column index sets stored in the matrix.

   Input Parameters:
     A  - the matrix; its Mat_SeqBAIJ data supplies a->i, a->j, a->diag, a->a, a->mbs, the
          index sets a->row / a->col and the work array a->solve_work
     bb - right-hand side, accessed read-only

   Output Parameter:
     xx - solution

   Notes:
     Each block occupies 16 consecutive values and is addressed as v[row + 4*col].
     The forward sweep reads block r[i] of b (r taken from a->row) and stores block i of the
     work array t; it uses all blocks at positions ai[i] .. ai[i+1]-1 of block row i.
     The backward sweep for block row i uses the blocks at positions
     adiag[i+1]+1 .. adiag[i]-1 and then multiplies by the block at position adiag[i]; the
     result updates block i of t and is written to block c[i] of x (c taken from a->col).
     adiag[n] is read when i = n-1, so a->diag must provide at least n+1 entries.

     NOTE(review): because the block at adiag[i] is multiplied in rather than solved against,
     it presumably already holds the inverse of the diagonal block - confirm against the
     matching factorization routine.
     NOTE(review): r[0], b and t[0..3] are accessed before n is tested, so this assumes
     a->mbs >= 1 - confirm callers never reach here with an empty matrix.
*/
37068f690400SShri Abhyankar #undef __FUNCT__
37074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
37084dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
370978bb4007SShri Abhyankar {
371078bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
371178bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
371278bb4007SShri Abhyankar   PetscErrorCode    ierr;
3713b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3714b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
371578bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
371678bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
371778bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
371878bb4007SShri Abhyankar   const PetscScalar *b;
371978bb4007SShri Abhyankar 
372078bb4007SShri Abhyankar   PetscFunctionBegin;
37213649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
372278bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
372378bb4007SShri Abhyankar   t  = a->solve_work;
372478bb4007SShri Abhyankar 
  /* r and c are indexed directly by the block row number i below */
372578bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
372678bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
372778bb4007SShri Abhyankar 
372878bb4007SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 uses no off-diagonal blocks here: block r[0] of b is copied to block 0 of t */
372978bb4007SShri Abhyankar   idx    = 4*r[0];
373078bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
373178bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
373278bb4007SShri Abhyankar   for (i=1; i<n; i++) {
    /* every block stored between ai[i] and ai[i+1] takes part in the forward sweep of row i */
373378bb4007SShri Abhyankar     v     = aa + 16*ai[i];
373478bb4007SShri Abhyankar     vi    = aj + ai[i];
373578bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
373678bb4007SShri Abhyankar     idx   = 4*r[i];
373778bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
373878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
      /* s -= (4x4 block) * (entries of t for block column vi[m]) */
373978bb4007SShri Abhyankar       idx   = 4*vi[m];
374078bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
374178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
374278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
374378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
374478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
374578bb4007SShri Abhyankar       v    += 16;
374678bb4007SShri Abhyankar     }
    /* the forward result is stored at block i of t, not at the permuted position */
374778bb4007SShri Abhyankar     idx        = 4*i;
374878bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
374978bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
375078bb4007SShri Abhyankar   }
375178bb4007SShri Abhyankar   /* backward solve the upper triangular */
375278bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the blocks of row i start just after adiag[i+1]; nz of them precede the block at adiag[i] */
375378bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
375478bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
375578bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
375678bb4007SShri Abhyankar     idt  = 4*i;
375778bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
375878bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
375978bb4007SShri Abhyankar     for(m=0;m<nz;m++){
376078bb4007SShri Abhyankar       idx   = 4*vi[m];
376178bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
376278bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
376378bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
376478bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
376578bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
376678bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
376778bb4007SShri Abhyankar       v += 16;
376878bb4007SShri Abhyankar     }
    /* v now points at the block at position adiag[i]; multiply by it, keep the result in t and scatter it to block c[i] of x */
376978bb4007SShri Abhyankar     idc      = 4*c[i];
377078bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
377178bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
377278bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
377378bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
377478bb4007SShri Abhyankar   }
377578bb4007SShri Abhyankar 
377678bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
377778bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37783649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
377978bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count is logged from the stored block count a->nz and the column dimension A->cmap->n */
378078bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
378178bb4007SShri Abhyankar   PetscFunctionReturn(0);
378278bb4007SShri Abhyankar }
378378bb4007SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_4_Demotion - Forward/backward block triangular solve for a factored
   SeqBAIJ matrix with 4x4 blocks in which all accumulation is carried out in MatScalar.

   Input Parameters:
     A  - the matrix; its Mat_SeqBAIJ data supplies a->i, a->j, a->diag, a->a, a->mbs, the
          index sets a->row / a->col and the work array a->solve_work
     bb - right-hand side, accessed read-only

   Output Parameter:
     xx - solution

   Notes:
     Entries of b are cast to MatScalar when loaded, the running sums and the work array t
     are MatScalar, and the final values are cast to PetscScalar when stored into x.
     Each block occupies 16 consecutive values and is addressed as v[row + 4*col].
     The forward sweep reads block r[i] of b (r taken from a->row) and stores block i of t.
     The backward sweep updates block i of t and writes it to block c[i] of x (c taken from
     a->col, walked from its last entry down to its first).
     In block row i the forward sweep uses the blocks at positions ai[i] .. diag[i]-1, the
     backward sweep uses the blocks at positions diag[i]+1 .. ai[i+1]-1, and the block at
     position diag[i] is multiplied into the row result.

     NOTE(review): a->solve_work is reinterpreted through a (MatScalar *) cast; presumably
     this routine is meant for builds where MatScalar is narrower than PetscScalar - confirm.
     NOTE(review): because the block at diag[i] is multiplied in rather than solved against,
     it presumably already holds the inverse of the diagonal block - confirm against the
     matching factorization routine.
     NOTE(review): r[0], b and t[0..3] are accessed before n is tested, so this assumes
     a->mbs >= 1 - confirm callers never reach here with an empty matrix.
*/
378478bb4007SShri Abhyankar #undef __FUNCT__
3785f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3786dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3787f26ec98cSKris Buschelman {
3788f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3789f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
37906849ba73SBarry Smith   PetscErrorCode    ierr;
3791b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3792b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
37935d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3794d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3795d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3796d9fead3dSBarry Smith   PetscScalar       *x;
3797d9fead3dSBarry Smith   const PetscScalar *b;
3798f26ec98cSKris Buschelman 
3799f26ec98cSKris Buschelman   PetscFunctionBegin;
38003649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
38011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* the work array is used as MatScalar storage for the whole solve */
3802f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3803f26ec98cSKris Buschelman 
  /* r walks the row indices forward from the first entry; c starts at the last column index and walks backward */
3804f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3805f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3806f26ec98cSKris Buschelman 
3807f26ec98cSKris Buschelman   /* forward solve the lower triangular */
  /* block row 0 uses no off-diagonal blocks here: block r[0] of b is converted and copied to block 0 of t */
3808f26ec98cSKris Buschelman   idx    = 4*(*r++);
3809f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3810f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3811f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3812f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3813f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3814f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3815f26ec98cSKris Buschelman     vi    = aj + ai[i];
3816f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3817f26ec98cSKris Buschelman     idx   = 4*(*r++);
3818f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3819f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3820f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3821f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3822f26ec98cSKris Buschelman     while (nz--) {
      /* s -= (4x4 block) * (entries of t for block column *vi) */
3823f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3824f26ec98cSKris Buschelman       x1  = t[idx];
3825f26ec98cSKris Buschelman       x2  = t[1+idx];
3826f26ec98cSKris Buschelman       x3  = t[2+idx];
3827f26ec98cSKris Buschelman       x4  = t[3+idx];
3828f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3829f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3830f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3831f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3832f26ec98cSKris Buschelman       v    += 16;
3833f26ec98cSKris Buschelman     }
    /* the forward result is stored at block i of t, not at the permuted position */
3834f26ec98cSKris Buschelman     idx        = 4*i;
3835f26ec98cSKris Buschelman     t[idx]   = s1;
3836f26ec98cSKris Buschelman     t[1+idx] = s2;
3837f26ec98cSKris Buschelman     t[2+idx] = s3;
3838f26ec98cSKris Buschelman     t[3+idx] = s4;
3839f26ec98cSKris Buschelman   }
3840f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3841f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
    /* start one block past diag[i] and visit the remaining blocks of row i, up to ai[i+1]-1 */
3842f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3843f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3844f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3845f26ec98cSKris Buschelman     idt  = 4*i;
3846f26ec98cSKris Buschelman     s1 = t[idt];
3847f26ec98cSKris Buschelman     s2 = t[1+idt];
3848f26ec98cSKris Buschelman     s3 = t[2+idt];
3849f26ec98cSKris Buschelman     s4 = t[3+idt];
3850f26ec98cSKris Buschelman     while (nz--) {
3851f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3852f26ec98cSKris Buschelman       x1  = t[idx];
3853f26ec98cSKris Buschelman       x2  = t[1+idx];
3854f26ec98cSKris Buschelman       x3  = t[2+idx];
3855f26ec98cSKris Buschelman       x4  = t[3+idx];
3856f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3857f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3858f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3859f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3860f26ec98cSKris Buschelman       v += 16;
3861f26ec98cSKris Buschelman     }
    /* multiply by the block at diag[i] into t first, then convert and scatter to block c[i] of x */
3862f26ec98cSKris Buschelman     idc      = 4*(*c--);
3863f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3864f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3865f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3866f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3867f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3868f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3869f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3870f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3871f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3872f26ec98cSKris Buschelman  }
3873f26ec98cSKris Buschelman 
3874f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3875f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
38771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count is logged from the stored block count a->nz and the column dimension A->cmap->n */
3878dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3879f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3880f26ec98cSKris Buschelman }
3881f26ec98cSKris Buschelman 
388224c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
388324c233c2SKris Buschelman 
388424c233c2SKris Buschelman #include PETSC_HAVE_SSE
388524c233c2SKris Buschelman 
388624c233c2SKris Buschelman #undef __FUNCT__
388724c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3888dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
388924c233c2SKris Buschelman {
389024c233c2SKris Buschelman   /*
389124c233c2SKris Buschelman      Note: This code uses demotion of double
389224c233c2SKris Buschelman      to float when performing the mixed-mode computation.
389324c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
389424c233c2SKris Buschelman   */
389524c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
389624c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
38976849ba73SBarry Smith   PetscErrorCode ierr;
38985d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
38995d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
390024c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
390187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
390224c233c2SKris Buschelman 
390324c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
390424c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
390524c233c2SKris Buschelman   unsigned long   offset;
390624c233c2SKris Buschelman 
390724c233c2SKris Buschelman   PetscFunctionBegin;
390824c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
390924c233c2SKris Buschelman 
391024c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
391124c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
391224c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
391324c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
391424c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
391524c233c2SKris Buschelman 
39161ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39171ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
391824c233c2SKris Buschelman     t  = a->solve_work;
391924c233c2SKris Buschelman 
392024c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
392124c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
392224c233c2SKris Buschelman 
392324c233c2SKris Buschelman     /* forward solve the lower triangular */
392424c233c2SKris Buschelman     idx  = 4*(*r++);
392524c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
392624c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
392724c233c2SKris Buschelman     v    =  aa + 16*ai[1];
392824c233c2SKris Buschelman 
392924c233c2SKris Buschelman     for (i=1; i<n;) {
393024c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
393124c233c2SKris Buschelman       vi   =  aj      + ai[i];
393224c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
393324c233c2SKris Buschelman       idx  =  4*(*r++);
393424c233c2SKris Buschelman 
393524c233c2SKris Buschelman       /* Demote sum from double to float */
393624c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
393724c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
393824c233c2SKris Buschelman 
393924c233c2SKris Buschelman       while (nz--) {
394024c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
394124c233c2SKris Buschelman         idx = 4*(*vi++);
394224c233c2SKris Buschelman 
394324c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
394424c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
394524c233c2SKris Buschelman 
394624c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
394724c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
394824c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
394924c233c2SKris Buschelman 
395024c233c2SKris Buschelman           /* First Column */
395124c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
395224c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
395324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
395424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
395524c233c2SKris Buschelman 
395624c233c2SKris Buschelman           /* Second Column */
395724c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
395824c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
395924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
396024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
396124c233c2SKris Buschelman 
396224c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
396324c233c2SKris Buschelman 
396424c233c2SKris Buschelman           /* Third Column */
396524c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
396624c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
396724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
396824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
396924c233c2SKris Buschelman 
397024c233c2SKris Buschelman           /* Fourth Column */
397124c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
397224c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
397324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
397424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
397524c233c2SKris Buschelman         SSE_INLINE_END_2
397624c233c2SKris Buschelman 
397724c233c2SKris Buschelman         v  += 16;
397824c233c2SKris Buschelman       }
397924c233c2SKris Buschelman       idx = 4*i;
398024c233c2SKris Buschelman       v   = aa + 16*ai[++i];
398124c233c2SKris Buschelman       PREFETCH_NTA(v);
398224c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
398324c233c2SKris Buschelman 
398424c233c2SKris Buschelman       /* Promote result from float to double */
398524c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
398624c233c2SKris Buschelman     }
398724c233c2SKris Buschelman     /* backward solve the upper triangular */
398824c233c2SKris Buschelman     idt  = 4*(n-1);
398924c233c2SKris Buschelman     ai16 = 16*diag[n-1];
399024c233c2SKris Buschelman     v    = aa + ai16 + 16;
399124c233c2SKris Buschelman     for (i=n-1; i>=0;){
399224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
399324c233c2SKris Buschelman       vi = aj + diag[i] + 1;
399424c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
399524c233c2SKris Buschelman 
399624c233c2SKris Buschelman       /* Demote accumulator from double to float */
399724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
399824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
399924c233c2SKris Buschelman 
400024c233c2SKris Buschelman       while (nz--) {
400124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
400224c233c2SKris Buschelman         idx = 4*(*vi++);
400324c233c2SKris Buschelman 
400424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
400524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
400624c233c2SKris Buschelman 
400724c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
400824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
400924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
401024c233c2SKris Buschelman 
401124c233c2SKris Buschelman           /* First Column */
401224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
401324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
401424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
401524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
401624c233c2SKris Buschelman 
401724c233c2SKris Buschelman           /* Second Column */
401824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
401924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
402024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
402124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
402224c233c2SKris Buschelman 
402324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
402424c233c2SKris Buschelman 
402524c233c2SKris Buschelman           /* Third Column */
402624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
402724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
402824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
402924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
403024c233c2SKris Buschelman 
403124c233c2SKris Buschelman           /* Fourth Column */
403224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
403324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
403424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
403524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
403624c233c2SKris Buschelman         SSE_INLINE_END_2
403724c233c2SKris Buschelman         v  += 16;
403824c233c2SKris Buschelman       }
403924c233c2SKris Buschelman       v    = aa + ai16;
404024c233c2SKris Buschelman       ai16 = 16*diag[--i];
404124c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
404224c233c2SKris Buschelman       /*
404324c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
404424c233c2SKris Buschelman          which was inverted as part of the factorization
404524c233c2SKris Buschelman       */
404624c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
404724c233c2SKris Buschelman         /* First Column */
404824c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
404924c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
405024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
405124c233c2SKris Buschelman 
405224c233c2SKris Buschelman         /* Second Column */
405324c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
405424c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
405524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
405624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
405724c233c2SKris Buschelman 
405824c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
405924c233c2SKris Buschelman 
406024c233c2SKris Buschelman         /* Third Column */
406124c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
406224c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
406324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
406424c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
406524c233c2SKris Buschelman 
406624c233c2SKris Buschelman         /* Fourth Column */
406724c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
406824c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
406924c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
407024c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
407124c233c2SKris Buschelman 
407224c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
407324c233c2SKris Buschelman       SSE_INLINE_END_3
407424c233c2SKris Buschelman 
407524c233c2SKris Buschelman       /* Promote solution from float to double */
407624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
407724c233c2SKris Buschelman 
407824c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
407924c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
408024c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
408124c233c2SKris Buschelman       idc  = 4*(*c--);
408224c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
408324c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
408424c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
408524c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
408624c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
408724c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
408824c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
408924c233c2SKris Buschelman       SSE_INLINE_END_2
409024c233c2SKris Buschelman       v    = aa + ai16 + 16;
409124c233c2SKris Buschelman       idt -= 4;
409224c233c2SKris Buschelman     }
409324c233c2SKris Buschelman 
409424c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
409524c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40961ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40971ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4098dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
409924c233c2SKris Buschelman   SSE_SCOPE_END;
410024c233c2SKris Buschelman   PetscFunctionReturn(0);
410124c233c2SKris Buschelman }
410224c233c2SKris Buschelman 
410324c233c2SKris Buschelman #endif
41040ef38995SBarry Smith 
41050ef38995SBarry Smith 
41064e2b4712SSatish Balay /*
41074e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
41084e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
41094e2b4712SSatish Balay */
41104a2ae208SSatish Balay #undef __FUNCT__
411106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
411206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
41134e2b4712SSatish Balay {
41144e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4115356650c2SBarry Smith   PetscInt          n=a->mbs;
4116356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4117dfbe8321SBarry Smith   PetscErrorCode    ierr;
4118356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4119d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
4120d9fead3dSBarry Smith   PetscScalar       *x;
4121d9fead3dSBarry Smith   const PetscScalar *b;
41224e2b4712SSatish Balay 
41234e2b4712SSatish Balay   PetscFunctionBegin;
41243649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
41251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41264e2b4712SSatish Balay 
4127aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
41282853dc0eSBarry Smith   {
412987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41302853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
41312853dc0eSBarry Smith   }
4132aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
41332853dc0eSBarry Smith   {
413487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41352853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
41362853dc0eSBarry Smith   }
4137aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
41382853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4139e1293385SBarry Smith #else
414030d4dcafSBarry Smith   {
414187828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4142d9fead3dSBarry Smith     const MatScalar *v;
4143356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4144356650c2SBarry Smith     const PetscInt  *vi;
4145e1293385SBarry Smith 
41464e2b4712SSatish Balay   /* forward solve the lower triangular */
41474e2b4712SSatish Balay   idx    = 0;
4148e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
41494e2b4712SSatish Balay   for (i=1; i<n; i++) {
41504e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
41514e2b4712SSatish Balay     vi    =  aj      + ai[i];
41524e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
4153e1293385SBarry Smith     idx   +=  4;
4154f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
41554e2b4712SSatish Balay     while (nz--) {
41564e2b4712SSatish Balay       jdx   = 4*(*vi++);
41574e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4158f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4159f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4160f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4161f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
41624e2b4712SSatish Balay       v    += 16;
41634e2b4712SSatish Balay     }
4164f1af5d2fSBarry Smith     x[idx]   = s1;
4165f1af5d2fSBarry Smith     x[1+idx] = s2;
4166f1af5d2fSBarry Smith     x[2+idx] = s3;
4167f1af5d2fSBarry Smith     x[3+idx] = s4;
41684e2b4712SSatish Balay   }
41694e2b4712SSatish Balay   /* backward solve the upper triangular */
41704e555682SBarry Smith   idt = 4*(n-1);
41714e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
41724e555682SBarry Smith     ai16 = 16*diag[i];
41734e555682SBarry Smith     v    = aa + ai16 + 16;
41744e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41754e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4176f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4177f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
41784e2b4712SSatish Balay     while (nz--) {
41794e2b4712SSatish Balay       idx   = 4*(*vi++);
41804e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4181f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4182f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4183f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4184f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
41854e2b4712SSatish Balay       v    += 16;
41864e2b4712SSatish Balay     }
41874e555682SBarry Smith     v        = aa + ai16;
4188f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4189f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4190f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4191f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4192329f5518SBarry Smith     idt -= 4;
41934e2b4712SSatish Balay   }
419430d4dcafSBarry Smith   }
4195e1293385SBarry Smith #endif
41964e2b4712SSatish Balay 
41973649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
41981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4199dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
42004e2b4712SSatish Balay   PetscFunctionReturn(0);
42014e2b4712SSatish Balay }
42024e2b4712SSatish Balay 
4203b2b2dd24SShri Abhyankar #undef __FUNCT__
42044dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
42054dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4206b2b2dd24SShri Abhyankar {
4207b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4208b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4209b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4210b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4211b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4212b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4213b2b2dd24SShri Abhyankar     PetscScalar       *x;
4214b2b2dd24SShri Abhyankar     const PetscScalar *b;
4215b2b2dd24SShri Abhyankar     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4216cee9d6f2SShri Abhyankar 
4217b2b2dd24SShri Abhyankar     PetscFunctionBegin;
42183649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4219b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4220b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4221b2b2dd24SShri Abhyankar     idx    = 0;
4222b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4223b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4224b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4225b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4226b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4227b2b2dd24SShri Abhyankar       idx   = bs*i;
4228b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4229b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
4230b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
4231b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4232b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4233b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4234b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4235b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4236b2b2dd24SShri Abhyankar 
4237b2b2dd24SShri Abhyankar           v   +=  bs2;
4238b2b2dd24SShri Abhyankar         }
4239b2b2dd24SShri Abhyankar 
4240b2b2dd24SShri Abhyankar        x[idx]   = s1;
4241b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4242b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4243b2b2dd24SShri Abhyankar        x[3+idx] = s4;
4244b2b2dd24SShri Abhyankar     }
4245b2b2dd24SShri Abhyankar 
4246b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4247b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4248b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4249b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4250b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4251b2b2dd24SShri Abhyankar      idt = bs*i;
4252b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4253b2b2dd24SShri Abhyankar 
4254b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
4255b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
4256b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4257b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4258b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4259b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4260b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4261b2b2dd24SShri Abhyankar 
4262b2b2dd24SShri Abhyankar         v   +=  bs2;
4263b2b2dd24SShri Abhyankar     }
4264b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4265b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4266b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4267b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4268b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4269b2b2dd24SShri Abhyankar 
4270b2b2dd24SShri Abhyankar   }
4271b2b2dd24SShri Abhyankar 
42723649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4273b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4274b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4275b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4276b2b2dd24SShri Abhyankar }
4277cee9d6f2SShri Abhyankar 
4278cee9d6f2SShri Abhyankar #undef __FUNCT__
4279f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4280dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4281f26ec98cSKris Buschelman {
4282f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4283b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4284dfbe8321SBarry Smith   PetscErrorCode    ierr;
4285b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4286b3260449SShri Abhyankar   const PetscScalar *b;
4287b3260449SShri Abhyankar   PetscScalar       *x;
4288f26ec98cSKris Buschelman 
4289f26ec98cSKris Buschelman   PetscFunctionBegin;
42903649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
42911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4292f26ec98cSKris Buschelman 
4293f26ec98cSKris Buschelman   {
4294f26ec98cSKris Buschelman     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4295b3260449SShri Abhyankar     const MatScalar  *v;
4296b3260449SShri Abhyankar     MatScalar        *t=(MatScalar *)x;
4297b3260449SShri Abhyankar     PetscInt         jdx,idt,idx,nz,i,ai16;
4298b3260449SShri Abhyankar     const PetscInt   *vi;
4299f26ec98cSKris Buschelman 
4300f26ec98cSKris Buschelman     /* forward solve the lower triangular */
4301f26ec98cSKris Buschelman     idx  = 0;
4302f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4303f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4304f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4305f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4306f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4307f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
4308f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
4309f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
4310f26ec98cSKris Buschelman       idx   +=  4;
4311f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
4312f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
4313f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
4314f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
4315f26ec98cSKris Buschelman       while (nz--) {
4316f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4317f26ec98cSKris Buschelman         x1  = t[jdx];
4318f26ec98cSKris Buschelman         x2  = t[1+jdx];
4319f26ec98cSKris Buschelman         x3  = t[2+jdx];
4320f26ec98cSKris Buschelman         x4  = t[3+jdx];
4321f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4322f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4323f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4324f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4325f26ec98cSKris Buschelman         v    += 16;
4326f26ec98cSKris Buschelman       }
4327f26ec98cSKris Buschelman       t[idx]   = s1;
4328f26ec98cSKris Buschelman       t[1+idx] = s2;
4329f26ec98cSKris Buschelman       t[2+idx] = s3;
4330f26ec98cSKris Buschelman       t[3+idx] = s4;
4331f26ec98cSKris Buschelman     }
4332f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4333f26ec98cSKris Buschelman     idt = 4*(n-1);
4334f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
4335f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4336f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4337f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4338f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4339f26ec98cSKris Buschelman       s1   = t[idt];
4340f26ec98cSKris Buschelman       s2   = t[1+idt];
4341f26ec98cSKris Buschelman       s3   = t[2+idt];
4342f26ec98cSKris Buschelman       s4   = t[3+idt];
4343f26ec98cSKris Buschelman       while (nz--) {
4344f26ec98cSKris Buschelman         idx = 4*(*vi++);
4345f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4346f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4347f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4348f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4349f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4350f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4351f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4352f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4353f26ec98cSKris Buschelman         v    += 16;
4354f26ec98cSKris Buschelman       }
4355f26ec98cSKris Buschelman       v        = aa + ai16;
4356f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4357f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4358f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4359f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4360f26ec98cSKris Buschelman       idt -= 4;
4361f26ec98cSKris Buschelman     }
4362f26ec98cSKris Buschelman   }
4363f26ec98cSKris Buschelman 
43643649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
43651ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4366dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4367f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4368f26ec98cSKris Buschelman }
4369f26ec98cSKris Buschelman 
43703660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
43713660e330SKris Buschelman 
43723660e330SKris Buschelman #include PETSC_HAVE_SSE
43733660e330SKris Buschelman #undef __FUNCT__
43747cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4375dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
43763660e330SKris Buschelman {
43773660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
43782aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4379dfbe8321SBarry Smith   PetscErrorCode ierr;
4380dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
43813660e330SKris Buschelman   MatScalar      *aa=a->a;
438287828ca2SBarry Smith   PetscScalar    *x,*b;
43833660e330SKris Buschelman 
43843660e330SKris Buschelman   PetscFunctionBegin;
43853660e330SKris Buschelman   SSE_SCOPE_BEGIN;
43863660e330SKris Buschelman   /*
43873660e330SKris Buschelman      Note: This code currently uses demotion of double
43883660e330SKris Buschelman      to float when performing the mixed-mode computation.
43893660e330SKris Buschelman      This may not be numerically reasonable for all applications.
43903660e330SKris Buschelman   */
43913660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
43923660e330SKris Buschelman 
43931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
43941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43953660e330SKris Buschelman   {
4396eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4397eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
43982aa5897fSKris Buschelman     int            nz,i,idt,ai16;
43992aa5897fSKris Buschelman     unsigned int   jdx,idx;
44002aa5897fSKris Buschelman     unsigned short *vi;
4401eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
44023660e330SKris Buschelman 
4403eb05f457SKris Buschelman     /* First block is the identity. */
44043660e330SKris Buschelman     idx  = 0;
4405eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
44062aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
44073660e330SKris Buschelman 
44083660e330SKris Buschelman     for (i=1; i<n;) {
44093660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44103660e330SKris Buschelman       vi   =  aj      + ai[i];
44113660e330SKris Buschelman       nz   =  diag[i] - ai[i];
44123660e330SKris Buschelman       idx +=  4;
44133660e330SKris Buschelman 
4414eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4415eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4416eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
44173660e330SKris Buschelman 
44183660e330SKris Buschelman       while (nz--) {
44193660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44202aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
44213660e330SKris Buschelman 
44223660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4423eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
44243660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44253660e330SKris Buschelman 
44263660e330SKris Buschelman           /* First Column */
44273660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44283660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44293660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44303660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44313660e330SKris Buschelman 
44323660e330SKris Buschelman           /* Second Column */
44333660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44343660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44353660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44363660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44373660e330SKris Buschelman 
44383660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44393660e330SKris Buschelman 
44403660e330SKris Buschelman           /* Third Column */
44413660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44423660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44433660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44443660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44453660e330SKris Buschelman 
44463660e330SKris Buschelman           /* Fourth Column */
44473660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
44483660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
44493660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44503660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
44513660e330SKris Buschelman         SSE_INLINE_END_2
44523660e330SKris Buschelman 
44533660e330SKris Buschelman         v  += 16;
44543660e330SKris Buschelman       }
44553660e330SKris Buschelman       v    =  aa + 16*ai[++i];
44563660e330SKris Buschelman       PREFETCH_NTA(v);
4457eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
44583660e330SKris Buschelman     }
4459eb05f457SKris Buschelman 
4460eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4461eb05f457SKris Buschelman 
44623660e330SKris Buschelman     idt  = 4*(n-1);
44633660e330SKris Buschelman     ai16 = 16*diag[n-1];
44643660e330SKris Buschelman     v    = aa + ai16 + 16;
44653660e330SKris Buschelman     for (i=n-1; i>=0;){
44663660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44673660e330SKris Buschelman       vi = aj + diag[i] + 1;
44683660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
44693660e330SKris Buschelman 
4470eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
44713660e330SKris Buschelman 
44723660e330SKris Buschelman       while (nz--) {
44733660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44742aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
44753660e330SKris Buschelman 
44763660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4477eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
44783660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44793660e330SKris Buschelman 
44803660e330SKris Buschelman           /* First Column */
44813660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44823660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44833660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44843660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44853660e330SKris Buschelman 
44863660e330SKris Buschelman           /* Second Column */
44873660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44883660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44893660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44903660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44913660e330SKris Buschelman 
44923660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44933660e330SKris Buschelman 
44943660e330SKris Buschelman           /* Third Column */
44953660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44963660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44973660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44983660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44993660e330SKris Buschelman 
45003660e330SKris Buschelman           /* Fourth Column */
45013660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
45023660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
45033660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45043660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
45053660e330SKris Buschelman         SSE_INLINE_END_2
45063660e330SKris Buschelman         v  += 16;
45073660e330SKris Buschelman       }
45083660e330SKris Buschelman       v    = aa + ai16;
45093660e330SKris Buschelman       ai16 = 16*diag[--i];
45103660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
45113660e330SKris Buschelman       /*
45123660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
45133660e330SKris Buschelman          which was inverted as part of the factorization
45143660e330SKris Buschelman       */
4515eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
45163660e330SKris Buschelman         /* First Column */
45173660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
45183660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
45193660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
45203660e330SKris Buschelman 
45213660e330SKris Buschelman         /* Second Column */
45223660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
45233660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
45243660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
45253660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
45263660e330SKris Buschelman 
45273660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
45283660e330SKris Buschelman 
45293660e330SKris Buschelman         /* Third Column */
45303660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
45313660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
45323660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
45333660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
45343660e330SKris Buschelman 
45353660e330SKris Buschelman         /* Fourth Column */
45363660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
45373660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
45383660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
45393660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
45403660e330SKris Buschelman 
45413660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
45423660e330SKris Buschelman       SSE_INLINE_END_3
45433660e330SKris Buschelman 
45443660e330SKris Buschelman       v    = aa + ai16 + 16;
45453660e330SKris Buschelman       idt -= 4;
45463660e330SKris Buschelman     }
4547eb05f457SKris Buschelman 
4548eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4549eb05f457SKris Buschelman     idt = 4*(n-1);
4550eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4551eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4552eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4553eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4554eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4555eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4556eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4557eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4558eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
455954693613SKris Buschelman       idt -= 4;
45603660e330SKris Buschelman     }
4561eb05f457SKris Buschelman 
4562eb05f457SKris Buschelman   } /* End of artificial scope. */
45631ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45641ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4565dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
45663660e330SKris Buschelman   SSE_SCOPE_END;
45673660e330SKris Buschelman   PetscFunctionReturn(0);
45683660e330SKris Buschelman }
45693660e330SKris Buschelman 
45707cf1b8d3SKris Buschelman #undef __FUNCT__
45717cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4572dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
45737cf1b8d3SKris Buschelman {
45747cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
45757cf1b8d3SKris Buschelman   int            *aj=a->j;
4576dfbe8321SBarry Smith   PetscErrorCode ierr;
4577dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
45787cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
45797cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
45807cf1b8d3SKris Buschelman 
45817cf1b8d3SKris Buschelman   PetscFunctionBegin;
45827cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
45837cf1b8d3SKris Buschelman   /*
45847cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
45857cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
45867cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
45877cf1b8d3SKris Buschelman   */
45887cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
45897cf1b8d3SKris Buschelman 
45901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45927cf1b8d3SKris Buschelman   {
45937cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
45947cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
45957cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
45967cf1b8d3SKris Buschelman     int       jdx,idx;
45977cf1b8d3SKris Buschelman     int       *vi;
45987cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
45997cf1b8d3SKris Buschelman 
46007cf1b8d3SKris Buschelman     /* First block is the identity. */
46017cf1b8d3SKris Buschelman     idx  = 0;
46027cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
46037cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
46047cf1b8d3SKris Buschelman 
46057cf1b8d3SKris Buschelman     for (i=1; i<n;) {
46067cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46077cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
46087cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
46097cf1b8d3SKris Buschelman       idx +=  4;
46107cf1b8d3SKris Buschelman 
46117cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
46127cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
46137cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
46147cf1b8d3SKris Buschelman 
46157cf1b8d3SKris Buschelman       while (nz--) {
46167cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46177cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
46187cf1b8d3SKris Buschelman /*          jdx = *vi++; */
46197cf1b8d3SKris Buschelman 
46207cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
46217cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
46227cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46237cf1b8d3SKris Buschelman 
46247cf1b8d3SKris Buschelman           /* First Column */
46257cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46267cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46277cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46287cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46297cf1b8d3SKris Buschelman 
46307cf1b8d3SKris Buschelman           /* Second Column */
46317cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46327cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46337cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46347cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46357cf1b8d3SKris Buschelman 
46367cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46377cf1b8d3SKris Buschelman 
46387cf1b8d3SKris Buschelman           /* Third Column */
46397cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46407cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46417cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46427cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46437cf1b8d3SKris Buschelman 
46447cf1b8d3SKris Buschelman           /* Fourth Column */
46457cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
46467cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
46477cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46487cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
46497cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46507cf1b8d3SKris Buschelman 
46517cf1b8d3SKris Buschelman         v  += 16;
46527cf1b8d3SKris Buschelman       }
46537cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
46547cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
46557cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
46567cf1b8d3SKris Buschelman     }
46577cf1b8d3SKris Buschelman 
46587cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
46597cf1b8d3SKris Buschelman 
46607cf1b8d3SKris Buschelman     idt  = 4*(n-1);
46617cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
46627cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
46637cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
46647cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46657cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
46667cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
46677cf1b8d3SKris Buschelman 
46687cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
46697cf1b8d3SKris Buschelman 
46707cf1b8d3SKris Buschelman       while (nz--) {
46717cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46727cf1b8d3SKris Buschelman         idx = 4*(*vi++);
46737cf1b8d3SKris Buschelman /*          idx = *vi++; */
46747cf1b8d3SKris Buschelman 
46757cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
46767cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
46777cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46787cf1b8d3SKris Buschelman 
46797cf1b8d3SKris Buschelman           /* First Column */
46807cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46817cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46827cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46837cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46847cf1b8d3SKris Buschelman 
46857cf1b8d3SKris Buschelman           /* Second Column */
46867cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46877cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46887cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46897cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46907cf1b8d3SKris Buschelman 
46917cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46927cf1b8d3SKris Buschelman 
46937cf1b8d3SKris Buschelman           /* Third Column */
46947cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46957cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46967cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46977cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46987cf1b8d3SKris Buschelman 
46997cf1b8d3SKris Buschelman           /* Fourth Column */
47007cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
47017cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
47027cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
47037cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
47047cf1b8d3SKris Buschelman         SSE_INLINE_END_2
47057cf1b8d3SKris Buschelman         v  += 16;
47067cf1b8d3SKris Buschelman       }
47077cf1b8d3SKris Buschelman       v    = aa + ai16;
47087cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
47097cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
47107cf1b8d3SKris Buschelman       /*
47117cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
47127cf1b8d3SKris Buschelman          which was inverted as part of the factorization
47137cf1b8d3SKris Buschelman       */
47147cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
47157cf1b8d3SKris Buschelman         /* First Column */
47167cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
47177cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
47187cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
47197cf1b8d3SKris Buschelman 
47207cf1b8d3SKris Buschelman         /* Second Column */
47217cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
47227cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
47237cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
47247cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
47257cf1b8d3SKris Buschelman 
47267cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
47277cf1b8d3SKris Buschelman 
47287cf1b8d3SKris Buschelman         /* Third Column */
47297cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
47307cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
47317cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
47327cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
47337cf1b8d3SKris Buschelman 
47347cf1b8d3SKris Buschelman         /* Fourth Column */
47357cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
47367cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
47377cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
47387cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
47397cf1b8d3SKris Buschelman 
47407cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
47417cf1b8d3SKris Buschelman       SSE_INLINE_END_3
47427cf1b8d3SKris Buschelman 
47437cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
47447cf1b8d3SKris Buschelman       idt -= 4;
47457cf1b8d3SKris Buschelman     }
47467cf1b8d3SKris Buschelman 
47477cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
47487cf1b8d3SKris Buschelman     idt = 4*(n-1);
47497cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
47507cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
47517cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
47527cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
47537cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
47547cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
47557cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
47567cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
47577cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
47587cf1b8d3SKris Buschelman       idt -= 4;
47597cf1b8d3SKris Buschelman     }
47607cf1b8d3SKris Buschelman 
47617cf1b8d3SKris Buschelman   } /* End of artificial scope. */
47621ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
47631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4764dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
47657cf1b8d3SKris Buschelman   SSE_SCOPE_END;
47667cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
47677cf1b8d3SKris Buschelman }
47687cf1b8d3SKris Buschelman 
47693660e330SKris Buschelman #endif
47708f690400SShri Abhyankar 
47714a2ae208SSatish Balay #undef __FUNCT__
477206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
477306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
47744e2b4712SSatish Balay {
47754e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47764e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
47776849ba73SBarry Smith   PetscErrorCode    ierr;
4778b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4779b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
47805d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4781d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4782d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4783d9fead3dSBarry Smith   const PetscScalar *b;
47844e2b4712SSatish Balay 
47854e2b4712SSatish Balay   PetscFunctionBegin;
47863649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
47871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4788f1af5d2fSBarry Smith   t  = a->solve_work;
47894e2b4712SSatish Balay 
47904e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47914e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47924e2b4712SSatish Balay 
47934e2b4712SSatish Balay   /* forward solve the lower triangular */
47944e2b4712SSatish Balay   idx    = 3*(*r++);
4795f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
47964e2b4712SSatish Balay   for (i=1; i<n; i++) {
47974e2b4712SSatish Balay     v     = aa + 9*ai[i];
47984e2b4712SSatish Balay     vi    = aj + ai[i];
47994e2b4712SSatish Balay     nz    = diag[i] - ai[i];
48004e2b4712SSatish Balay     idx   = 3*(*r++);
4801f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48024e2b4712SSatish Balay     while (nz--) {
48034e2b4712SSatish Balay       idx   = 3*(*vi++);
4804f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4805f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4806f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4807f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48084e2b4712SSatish Balay       v += 9;
48094e2b4712SSatish Balay     }
48104e2b4712SSatish Balay     idx = 3*i;
4811f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48124e2b4712SSatish Balay   }
48134e2b4712SSatish Balay   /* backward solve the upper triangular */
48144e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
48154e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
48164e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
48174e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
48184e2b4712SSatish Balay     idt  = 3*i;
4819f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48204e2b4712SSatish Balay     while (nz--) {
48214e2b4712SSatish Balay       idx   = 3*(*vi++);
4822f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4823f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4824f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4825f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48264e2b4712SSatish Balay       v += 9;
48274e2b4712SSatish Balay     }
48284e2b4712SSatish Balay     idc = 3*(*c--);
48294e2b4712SSatish Balay     v   = aa + 9*diag[i];
4830f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4831f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4832f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48334e2b4712SSatish Balay   }
48344e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48354e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48363649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
48371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4838dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48394e2b4712SSatish Balay   PetscFunctionReturn(0);
48404e2b4712SSatish Balay }
48414e2b4712SSatish Balay 
48420c4413a7SShri Abhyankar #undef __FUNCT__
48434dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
48444dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
48450c4413a7SShri Abhyankar {
48460c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48470c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48480c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4849b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4850b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
48510c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48520c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48530c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
48540c4413a7SShri Abhyankar   const PetscScalar *b;
48550c4413a7SShri Abhyankar 
48560c4413a7SShri Abhyankar   PetscFunctionBegin;
48573649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48580c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48590c4413a7SShri Abhyankar   t  = a->solve_work;
48600c4413a7SShri Abhyankar 
48610c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48620c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
48630c4413a7SShri Abhyankar 
48640c4413a7SShri Abhyankar   /* forward solve the lower triangular */
48650c4413a7SShri Abhyankar   idx    = 3*r[0];
48660c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
48670c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
48680c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
48690c4413a7SShri Abhyankar     vi    = aj + ai[i];
48700c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
48710c4413a7SShri Abhyankar     idx   = 3*r[i];
48720c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48730c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48740c4413a7SShri Abhyankar       idx   = 3*vi[m];
48750c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48760c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48770c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48780c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48790c4413a7SShri Abhyankar       v += 9;
48800c4413a7SShri Abhyankar     }
48810c4413a7SShri Abhyankar     idx = 3*i;
48820c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48830c4413a7SShri Abhyankar   }
48840c4413a7SShri Abhyankar   /* backward solve the upper triangular */
48850c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
48860c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
48870c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
48880c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
48890c4413a7SShri Abhyankar     idt  = 3*i;
48900c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48910c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48920c4413a7SShri Abhyankar       idx   = 3*vi[m];
48930c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48940c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48950c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48960c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48970c4413a7SShri Abhyankar       v += 9;
48980c4413a7SShri Abhyankar     }
48990c4413a7SShri Abhyankar     idc = 3*c[i];
49000c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
49010c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
49020c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
49030c4413a7SShri Abhyankar   }
49040c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49050c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49063649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49070c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49080c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
49090c4413a7SShri Abhyankar   PetscFunctionReturn(0);
49100c4413a7SShri Abhyankar }
49110c4413a7SShri Abhyankar 
491215091d37SBarry Smith /*
491315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
491415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
491515091d37SBarry Smith */
49164a2ae208SSatish Balay #undef __FUNCT__
491706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
491806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
491915091d37SBarry Smith {
492015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
49210b68f018SBarry Smith   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4922dfbe8321SBarry Smith   PetscErrorCode    ierr;
49230b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4924d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4925d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4926d9fead3dSBarry Smith   const PetscScalar *b;
49270b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
492815091d37SBarry Smith 
492915091d37SBarry Smith   PetscFunctionBegin;
49303649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
49311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
493215091d37SBarry Smith 
493315091d37SBarry Smith   /* forward solve the lower triangular */
493415091d37SBarry Smith   idx    = 0;
493515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
493615091d37SBarry Smith   for (i=1; i<n; i++) {
493715091d37SBarry Smith     v     =  aa      + 9*ai[i];
493815091d37SBarry Smith     vi    =  aj      + ai[i];
493915091d37SBarry Smith     nz    =  diag[i] - ai[i];
494015091d37SBarry Smith     idx   +=  3;
4941f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
494215091d37SBarry Smith     while (nz--) {
494315091d37SBarry Smith       jdx   = 3*(*vi++);
494415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4945f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4946f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4947f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
494815091d37SBarry Smith       v    += 9;
494915091d37SBarry Smith     }
4950f1af5d2fSBarry Smith     x[idx]   = s1;
4951f1af5d2fSBarry Smith     x[1+idx] = s2;
4952f1af5d2fSBarry Smith     x[2+idx] = s3;
495315091d37SBarry Smith   }
495415091d37SBarry Smith   /* backward solve the upper triangular */
495515091d37SBarry Smith   for (i=n-1; i>=0; i--){
495615091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
495715091d37SBarry Smith     vi   = aj + diag[i] + 1;
495815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
495915091d37SBarry Smith     idt  = 3*i;
4960f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4961f1af5d2fSBarry Smith     s3 = x[2+idt];
496215091d37SBarry Smith     while (nz--) {
496315091d37SBarry Smith       idx   = 3*(*vi++);
496415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4965f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
496815091d37SBarry Smith       v    += 9;
496915091d37SBarry Smith     }
497015091d37SBarry Smith     v        = aa +  9*diag[i];
4971f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4972f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4973f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
497415091d37SBarry Smith   }
497515091d37SBarry Smith 
49763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4978dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
497915091d37SBarry Smith   PetscFunctionReturn(0);
498015091d37SBarry Smith }
498115091d37SBarry Smith 
4982cee9d6f2SShri Abhyankar #undef __FUNCT__
49834dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
49844dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4985b2b2dd24SShri Abhyankar {
4986b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4987b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4988b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4989b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4990b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4991b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4992b2b2dd24SShri Abhyankar     PetscScalar       *x;
4993b2b2dd24SShri Abhyankar     const PetscScalar *b;
4994b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4995b2b2dd24SShri Abhyankar 
4996b2b2dd24SShri Abhyankar     PetscFunctionBegin;
49973649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4998b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4999b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5000b2b2dd24SShri Abhyankar     idx    = 0;
5001b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5002b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5003b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
5004b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5005b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5006b2b2dd24SShri Abhyankar       idx   = bs*i;
5007b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5008b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5009b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
5010b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5011b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5012b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5013b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5014b2b2dd24SShri Abhyankar 
5015b2b2dd24SShri Abhyankar           v   +=  bs2;
5016b2b2dd24SShri Abhyankar         }
5017b2b2dd24SShri Abhyankar 
5018b2b2dd24SShri Abhyankar        x[idx]   = s1;
5019b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5020b2b2dd24SShri Abhyankar        x[2+idx] = s3;
5021b2b2dd24SShri Abhyankar     }
5022b2b2dd24SShri Abhyankar 
5023b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5024b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5025b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
5026b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5027b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5028b2b2dd24SShri Abhyankar      idt = bs*i;
5029b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5030b2b2dd24SShri Abhyankar 
5031b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5032b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
5033b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5034b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5035b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5036b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5037b2b2dd24SShri Abhyankar 
5038b2b2dd24SShri Abhyankar         v   +=  bs2;
5039b2b2dd24SShri Abhyankar     }
5040b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5041b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5042b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5043b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5044b2b2dd24SShri Abhyankar 
5045b2b2dd24SShri Abhyankar   }
5046b2b2dd24SShri Abhyankar 
50473649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5048b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5049b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5050b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5051b2b2dd24SShri Abhyankar }
5052b2b2dd24SShri Abhyankar 
5053b2b2dd24SShri Abhyankar #undef __FUNCT__
505406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
505506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
50564e2b4712SSatish Balay {
50574e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
50584e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
50596849ba73SBarry Smith   PetscErrorCode    ierr;
5060b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5061b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
50625d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5063d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5064d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
5065d9fead3dSBarry Smith   const PetscScalar *b;
50664e2b4712SSatish Balay 
50674e2b4712SSatish Balay   PetscFunctionBegin;
50683649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
50691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5070f1af5d2fSBarry Smith   t  = a->solve_work;
50714e2b4712SSatish Balay 
50724e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50734e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
50744e2b4712SSatish Balay 
50754e2b4712SSatish Balay   /* forward solve the lower triangular */
50764e2b4712SSatish Balay   idx    = 2*(*r++);
5077f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
50784e2b4712SSatish Balay   for (i=1; i<n; i++) {
50794e2b4712SSatish Balay     v     = aa + 4*ai[i];
50804e2b4712SSatish Balay     vi    = aj + ai[i];
50814e2b4712SSatish Balay     nz    = diag[i] - ai[i];
50824e2b4712SSatish Balay     idx   = 2*(*r++);
5083f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
50844e2b4712SSatish Balay     while (nz--) {
50854e2b4712SSatish Balay       idx   = 2*(*vi++);
5086f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5087f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5088f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
50894e2b4712SSatish Balay       v += 4;
50904e2b4712SSatish Balay     }
50914e2b4712SSatish Balay     idx = 2*i;
5092f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
50934e2b4712SSatish Balay   }
50944e2b4712SSatish Balay   /* backward solve the upper triangular */
50954e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
50964e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
50974e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
50984e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
50994e2b4712SSatish Balay     idt  = 2*i;
5100f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
51014e2b4712SSatish Balay     while (nz--) {
51024e2b4712SSatish Balay       idx   = 2*(*vi++);
5103f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5104f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5105f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51064e2b4712SSatish Balay       v += 4;
51074e2b4712SSatish Balay     }
51084e2b4712SSatish Balay     idc = 2*(*c--);
51094e2b4712SSatish Balay     v   = aa + 4*diag[i];
5110f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5111f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51124e2b4712SSatish Balay   }
51134e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51144e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5117dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51184e2b4712SSatish Balay   PetscFunctionReturn(0);
51194e2b4712SSatish Balay }
51204e2b4712SSatish Balay 
51210c4413a7SShri Abhyankar #undef __FUNCT__
51224dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
51234dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
51240c4413a7SShri Abhyankar {
51250c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
51260c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
51270c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5128b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5129b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
51300c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
51310c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
51320c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
51330c4413a7SShri Abhyankar   const PetscScalar *b;
51340c4413a7SShri Abhyankar 
51350c4413a7SShri Abhyankar   PetscFunctionBegin;
51363649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51370c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
51380c4413a7SShri Abhyankar   t  = a->solve_work;
51390c4413a7SShri Abhyankar 
51400c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51410c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
51420c4413a7SShri Abhyankar 
51430c4413a7SShri Abhyankar   /* forward solve the lower triangular */
51440c4413a7SShri Abhyankar   idx    = 2*r[0];
51450c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
51460c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
51470c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
51480c4413a7SShri Abhyankar     vi    = aj + ai[i];
51490c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
51500c4413a7SShri Abhyankar     idx   = 2*r[i];
51510c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
51520c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
51530c4413a7SShri Abhyankar       jdx   = 2*vi[m];
51540c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
51550c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51560c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51570c4413a7SShri Abhyankar       v += 4;
51580c4413a7SShri Abhyankar     }
51590c4413a7SShri Abhyankar     idx = 2*i;
51600c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
51610c4413a7SShri Abhyankar   }
51620c4413a7SShri Abhyankar   /* backward solve the upper triangular */
51630c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
51640c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
51650c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
51660c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
51670c4413a7SShri Abhyankar     idt  = 2*i;
51680c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
51690c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
51700c4413a7SShri Abhyankar       idx   = 2*vi[m];
51710c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
51720c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51730c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51740c4413a7SShri Abhyankar       v += 4;
51750c4413a7SShri Abhyankar     }
51760c4413a7SShri Abhyankar     idc = 2*c[i];
51770c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
51780c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51790c4413a7SShri Abhyankar   }
51800c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51810c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51823649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51830c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51840c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51850c4413a7SShri Abhyankar   PetscFunctionReturn(0);
51860c4413a7SShri Abhyankar }
51878f690400SShri Abhyankar 
518815091d37SBarry Smith /*
518915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
519015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
519115091d37SBarry Smith */
51924a2ae208SSatish Balay #undef __FUNCT__
519306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
519406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
519515091d37SBarry Smith {
519615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5197b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5198dfbe8321SBarry Smith   PetscErrorCode    ierr;
5199d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5200d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5201d9fead3dSBarry Smith   const PetscScalar *b;
5202b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
520315091d37SBarry Smith 
520415091d37SBarry Smith   PetscFunctionBegin;
52053649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
52061ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
520715091d37SBarry Smith 
520815091d37SBarry Smith   /* forward solve the lower triangular */
520915091d37SBarry Smith   idx    = 0;
521015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
521115091d37SBarry Smith   for (i=1; i<n; i++) {
521215091d37SBarry Smith     v     =  aa      + 4*ai[i];
521315091d37SBarry Smith     vi    =  aj      + ai[i];
521415091d37SBarry Smith     nz    =  diag[i] - ai[i];
521515091d37SBarry Smith     idx   +=  2;
5216f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
521715091d37SBarry Smith     while (nz--) {
521815091d37SBarry Smith       jdx   = 2*(*vi++);
521915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
5220f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5221f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
522215091d37SBarry Smith       v    += 4;
522315091d37SBarry Smith     }
5224f1af5d2fSBarry Smith     x[idx]   = s1;
5225f1af5d2fSBarry Smith     x[1+idx] = s2;
522615091d37SBarry Smith   }
522715091d37SBarry Smith   /* backward solve the upper triangular */
522815091d37SBarry Smith   for (i=n-1; i>=0; i--){
522915091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
523015091d37SBarry Smith     vi   = aj + diag[i] + 1;
523115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
523215091d37SBarry Smith     idt  = 2*i;
5233f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
523415091d37SBarry Smith     while (nz--) {
523515091d37SBarry Smith       idx   = 2*(*vi++);
523615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
5237f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5238f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
523915091d37SBarry Smith       v    += 4;
524015091d37SBarry Smith     }
524115091d37SBarry Smith     v        = aa +  4*diag[i];
5242f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5243f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
524415091d37SBarry Smith   }
524515091d37SBarry Smith 
52463649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5248dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
524915091d37SBarry Smith   PetscFunctionReturn(0);
525015091d37SBarry Smith }
525115091d37SBarry Smith 
5252cee9d6f2SShri Abhyankar #undef __FUNCT__
52534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
52544dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5255b2b2dd24SShri Abhyankar {
5256b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5257b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5258b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,idt,jdx;
5259b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5260b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5261b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
5262b2b2dd24SShri Abhyankar     const PetscScalar *b;
5263b2b2dd24SShri Abhyankar 
5264b2b2dd24SShri Abhyankar     PetscFunctionBegin;
52653649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5266b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5267b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5268b2b2dd24SShri Abhyankar     idx    = 0;
5269b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
5270b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5271b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
5272b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5273b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5274b2b2dd24SShri Abhyankar        idx  = 2*i;
5275b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
52764c0dbd8dSJed Brown        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52774c0dbd8dSJed Brown        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5278b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5279b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
5280b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
5281b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
5282b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
5283b2b2dd24SShri Abhyankar            v   +=  4;
5284b2b2dd24SShri Abhyankar         }
5285b2b2dd24SShri Abhyankar        x[idx]   = s1;
5286b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5287b2b2dd24SShri Abhyankar     }
5288b2b2dd24SShri Abhyankar 
5289b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5290b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5291b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
5292b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5293b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5294b2b2dd24SShri Abhyankar      idt = 2*i;
5295b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
52964c0dbd8dSJed Brown      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52974c0dbd8dSJed Brown      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5298b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5299b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
5300b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
5301b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
5302b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
5303b2b2dd24SShri Abhyankar          v    += 4;
5304b2b2dd24SShri Abhyankar     }
5305b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5306b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
5307b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
5308b2b2dd24SShri Abhyankar   }
5309b2b2dd24SShri Abhyankar 
53103649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5311b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5312b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5313b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5314b2b2dd24SShri Abhyankar }
5315b2b2dd24SShri Abhyankar 
5316b2b2dd24SShri Abhyankar #undef __FUNCT__
531706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
531806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
53194e2b4712SSatish Balay {
53204e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
53214e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
53226849ba73SBarry Smith   PetscErrorCode    ierr;
5323b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5324b3260449SShri Abhyankar   PetscInt          i,nz;
53255d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5326b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5327b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5328b3260449SShri Abhyankar   const PetscScalar *b;
53294e2b4712SSatish Balay 
53304e2b4712SSatish Balay   PetscFunctionBegin;
53314e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
53324e2b4712SSatish Balay 
53333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
53341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5335f1af5d2fSBarry Smith   t  = a->solve_work;
53364e2b4712SSatish Balay 
53374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
53384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
53394e2b4712SSatish Balay 
53404e2b4712SSatish Balay   /* forward solve the lower triangular */
5341f1af5d2fSBarry Smith   t[0] = b[*r++];
53424e2b4712SSatish Balay   for (i=1; i<n; i++) {
53434e2b4712SSatish Balay     v     = aa + ai[i];
53444e2b4712SSatish Balay     vi    = aj + ai[i];
53454e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5346f1af5d2fSBarry Smith     s1  = b[*r++];
53474e2b4712SSatish Balay     while (nz--) {
5348f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53494e2b4712SSatish Balay     }
5350f1af5d2fSBarry Smith     t[i] = s1;
53514e2b4712SSatish Balay   }
53524e2b4712SSatish Balay   /* backward solve the upper triangular */
53534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
53544e2b4712SSatish Balay     v    = aa + diag[i] + 1;
53554e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
53564e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5357f1af5d2fSBarry Smith     s1 = t[i];
53584e2b4712SSatish Balay     while (nz--) {
5359f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53604e2b4712SSatish Balay     }
5361f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
53624e2b4712SSatish Balay   }
53634e2b4712SSatish Balay 
53644e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
53654e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
53663649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
53671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5368dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53694e2b4712SSatish Balay   PetscFunctionReturn(0);
53704e2b4712SSatish Balay }
5371048b5e81SShri Abhyankar 
5372048b5e81SShri Abhyankar #undef __FUNCT__
5373048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5374048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5375048b5e81SShri Abhyankar {
5376048b5e81SShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5377048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5378048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5379048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5380048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5381048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5382048b5e81SShri Abhyankar   const PetscScalar *b;
5383048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5384048b5e81SShri Abhyankar 
5385048b5e81SShri Abhyankar   PetscFunctionBegin;
5386048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5387048b5e81SShri Abhyankar 
53883649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5389048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5390048b5e81SShri Abhyankar   tmp  = a->solve_work;
5391048b5e81SShri Abhyankar 
5392048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5393048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5394048b5e81SShri Abhyankar 
5395048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5396048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5397048b5e81SShri Abhyankar   v      = aa;
5398048b5e81SShri Abhyankar   vi     = aj;
5399048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5400048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5401048b5e81SShri Abhyankar     sum = b[r[i]];
5402048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5403048b5e81SShri Abhyankar     tmp[i] = sum;
5404048b5e81SShri Abhyankar     v += nz; vi += nz;
5405048b5e81SShri Abhyankar   }
5406048b5e81SShri Abhyankar 
5407048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5408048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5409048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5410048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5411048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5412048b5e81SShri Abhyankar     sum = tmp[i];
5413048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5414048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5415048b5e81SShri Abhyankar   }
5416048b5e81SShri Abhyankar 
5417048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5418048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54193649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5420048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5421048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5422048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5423048b5e81SShri Abhyankar }
5424048b5e81SShri Abhyankar 
542515091d37SBarry Smith /*
542615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
542715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
542815091d37SBarry Smith */
54294a2ae208SSatish Balay #undef __FUNCT__
543006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
543106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
543215091d37SBarry Smith {
543315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5434b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5435dfbe8321SBarry Smith   PetscErrorCode    ierr;
5436b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5437b3260449SShri Abhyankar   PetscScalar       *x;
5438b3260449SShri Abhyankar   const PetscScalar *b;
543987828ca2SBarry Smith   PetscScalar       s1,x1;
5440b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
544115091d37SBarry Smith 
544215091d37SBarry Smith   PetscFunctionBegin;
54433649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
54441ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544515091d37SBarry Smith 
544615091d37SBarry Smith   /* forward solve the lower triangular */
544715091d37SBarry Smith   idx    = 0;
544815091d37SBarry Smith   x[0]   = b[0];
544915091d37SBarry Smith   for (i=1; i<n; i++) {
545015091d37SBarry Smith     v     =  aa      + ai[i];
545115091d37SBarry Smith     vi    =  aj      + ai[i];
545215091d37SBarry Smith     nz    =  diag[i] - ai[i];
545315091d37SBarry Smith     idx   +=  1;
5454f1af5d2fSBarry Smith     s1  =  b[idx];
545515091d37SBarry Smith     while (nz--) {
545615091d37SBarry Smith       jdx   = *vi++;
545715091d37SBarry Smith       x1    = x[jdx];
5458f1af5d2fSBarry Smith       s1 -= v[0]*x1;
545915091d37SBarry Smith       v    += 1;
546015091d37SBarry Smith     }
5461f1af5d2fSBarry Smith     x[idx]   = s1;
546215091d37SBarry Smith   }
546315091d37SBarry Smith   /* backward solve the upper triangular */
546415091d37SBarry Smith   for (i=n-1; i>=0; i--){
546515091d37SBarry Smith     v    = aa + diag[i] + 1;
546615091d37SBarry Smith     vi   = aj + diag[i] + 1;
546715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
546815091d37SBarry Smith     idt  = i;
5469f1af5d2fSBarry Smith     s1 = x[idt];
547015091d37SBarry Smith     while (nz--) {
547115091d37SBarry Smith       idx   = *vi++;
547215091d37SBarry Smith       x1    = x[idx];
5473f1af5d2fSBarry Smith       s1 -= v[0]*x1;
547415091d37SBarry Smith       v    += 1;
547515091d37SBarry Smith     }
547615091d37SBarry Smith     v        = aa +  diag[i];
5477f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
547815091d37SBarry Smith   }
54793649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54801ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5481dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
548215091d37SBarry Smith   PetscFunctionReturn(0);
548315091d37SBarry Smith }
54844e2b4712SSatish Balay 
5485048b5e81SShri Abhyankar 
5486048b5e81SShri Abhyankar #undef __FUNCT__
5487048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5488048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5489048b5e81SShri Abhyankar {
5490048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5491048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5492048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5493048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5494048b5e81SShri Abhyankar   const PetscScalar *b;
5495048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5496048b5e81SShri Abhyankar   PetscInt          i,nz;
5497048b5e81SShri Abhyankar 
5498048b5e81SShri Abhyankar   PetscFunctionBegin;
5499048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5500048b5e81SShri Abhyankar 
55013649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5502048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5503048b5e81SShri Abhyankar 
5504048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5505048b5e81SShri Abhyankar   x[0] = b[0];
5506048b5e81SShri Abhyankar   v    = aa;
5507048b5e81SShri Abhyankar   vi   = aj;
5508048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5509048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5510048b5e81SShri Abhyankar     sum = b[i];
5511048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5512048b5e81SShri Abhyankar     v  += nz;
5513048b5e81SShri Abhyankar     vi += nz;
5514048b5e81SShri Abhyankar     x[i] = sum;
5515048b5e81SShri Abhyankar   }
5516048b5e81SShri Abhyankar 
5517048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5518048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5519048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5520048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5521048b5e81SShri Abhyankar     nz = adiag[i] - adiag[i+1]-1;
5522048b5e81SShri Abhyankar     sum = x[i];
5523048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5524048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5525048b5e81SShri Abhyankar   }
5526048b5e81SShri Abhyankar 
5527048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
55283649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5529048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5530048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5531048b5e81SShri Abhyankar }
5532048b5e81SShri Abhyankar 
55334e2b4712SSatish Balay /* ----------------------------------------------------------------*/
553409573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
55356bce7ff8SHong Zhang 
55362b0b2ea7SShri Abhyankar #undef __FUNCT__
553729a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5538766f9fbaSBarry Smith /*
5539766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5540766f9fbaSBarry Smith */
554129a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
55422b0b2ea7SShri Abhyankar {
55432b0b2ea7SShri Abhyankar   Mat             C=B;
55442b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
55452b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5546766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5547766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5548766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5549766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5550766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
55512b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
55520fa040f9SShri Abhyankar   PetscInt        sol_ver;
55532b0b2ea7SShri Abhyankar 
55542b0b2ea7SShri Abhyankar   PetscFunctionBegin;
55552b0b2ea7SShri Abhyankar 
5556*c55dd799SBarry Smith   ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
55570fa040f9SShri Abhyankar 
55582b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
55592b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
55602b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
55612b0b2ea7SShri Abhyankar 
55622b0b2ea7SShri Abhyankar   for (i=0; i<n; i++){
55632b0b2ea7SShri Abhyankar     /* zero rtmp */
55642b0b2ea7SShri Abhyankar     /* L part */
55652b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
55662b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55672b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
55682b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55692b0b2ea7SShri Abhyankar     }
55702b0b2ea7SShri Abhyankar 
55712b0b2ea7SShri Abhyankar     /* U part */
55722b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
55732b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
55742b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
55752b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55762b0b2ea7SShri Abhyankar     }
55772b0b2ea7SShri Abhyankar 
55782b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
557929a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
558029a97285SShri Abhyankar     ajtmp = aj + ai[i];
558129a97285SShri Abhyankar     v     = aa + bs2*ai[i];
55822b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
558329a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
55842b0b2ea7SShri Abhyankar     }
55852b0b2ea7SShri Abhyankar 
55862b0b2ea7SShri Abhyankar     /* elimination */
55872b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55882b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
55892b0b2ea7SShri Abhyankar     for(k=0;k < nzL;k++) {
55902b0b2ea7SShri Abhyankar       row = bjtmp[k];
55912b0b2ea7SShri Abhyankar       pc = rtmp + bs2*row;
55922b0b2ea7SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
55932b0b2ea7SShri Abhyankar       if (flg) {
55942b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
5595766f9fbaSBarry Smith 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5596766f9fbaSBarry Smith 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
55972b0b2ea7SShri Abhyankar 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
55982b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
55992b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56002b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5601766f9fbaSBarry Smith           vv   = rtmp + bs2*pj[j];
5602766f9fbaSBarry Smith           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5603766f9fbaSBarry Smith 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
56042b0b2ea7SShri Abhyankar 	  pv  += bs2;
56052b0b2ea7SShri Abhyankar         }
5606766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56072b0b2ea7SShri Abhyankar       }
56082b0b2ea7SShri Abhyankar     }
56092b0b2ea7SShri Abhyankar 
56102b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
56112b0b2ea7SShri Abhyankar     /* L part */
56122b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
56132b0b2ea7SShri Abhyankar     pj   = b->j + bi[i] ;
56142b0b2ea7SShri Abhyankar     nz   = bi[i+1] - bi[i];
56152b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56162b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56172b0b2ea7SShri Abhyankar     }
56182b0b2ea7SShri Abhyankar 
56192b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56202b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
56212b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
56222b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5623766f9fbaSBarry Smith     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5624182b8fbaSHong Zhang     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
56252b0b2ea7SShri Abhyankar 
56262b0b2ea7SShri Abhyankar     /* U part */
56272b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56282b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56292b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56302b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++){
56312b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56322b0b2ea7SShri Abhyankar     }
56332b0b2ea7SShri Abhyankar   }
56342b0b2ea7SShri Abhyankar 
56352b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5636832cc040SShri Abhyankar   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5637766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
56382b0b2ea7SShri Abhyankar   C->assembled = PETSC_TRUE;
5639766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56402b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
56412b0b2ea7SShri Abhyankar }
56422b0b2ea7SShri Abhyankar 
56436bce7ff8SHong Zhang #undef __FUNCT__
56444dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
56454dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
56466bce7ff8SHong Zhang {
56476bce7ff8SHong Zhang   Mat            C=B;
56486bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
56496bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
56506bce7ff8SHong Zhang   PetscErrorCode ierr;
56515a586d82SBarry Smith   const PetscInt *r,*ic;
56526bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
56536bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5654b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5655914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5656914a18a2SHong Zhang   MatScalar      *v_work;
5657ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity;
56586bce7ff8SHong Zhang 
56596bce7ff8SHong Zhang   PetscFunctionBegin;
56606bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
56616bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5662ae3d28f0SHong Zhang 
5663fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5664fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
56656bce7ff8SHong Zhang 
5666914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5667fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5668914a18a2SHong Zhang 
56696bce7ff8SHong Zhang   for (i=0; i<n; i++){
56706bce7ff8SHong Zhang     /* zero rtmp */
56716bce7ff8SHong Zhang     /* L part */
56726bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
56736bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5674914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5675914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5676914a18a2SHong Zhang     }
56776bce7ff8SHong Zhang 
56786bce7ff8SHong Zhang     /* U part */
56791a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
56801a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56811a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
56821a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56831a83e813SShri Abhyankar     }
56841a83e813SShri Abhyankar 
56851a83e813SShri Abhyankar     /* load in initial (unfactored row) */
56861a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
56871a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
56881a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
56891a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
56901a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56911a83e813SShri Abhyankar     }
56921a83e813SShri Abhyankar 
56931a83e813SShri Abhyankar     /* elimination */
56941a83e813SShri Abhyankar     bjtmp = bj + bi[i];
56951a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
56961a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
56971a83e813SShri Abhyankar       row = bjtmp[k];
56981a83e813SShri Abhyankar       pc = rtmp + bs2*row;
56991a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
57001a83e813SShri Abhyankar       if (flg) {
57011a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
57021a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
57031a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
57041a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
57051a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
57061a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
57071a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
57081a83e813SShri Abhyankar         }
57091a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
57101a83e813SShri Abhyankar       }
57111a83e813SShri Abhyankar     }
57121a83e813SShri Abhyankar 
57131a83e813SShri Abhyankar     /* finished row so stick it into b->a */
57141a83e813SShri Abhyankar     /* L part */
57151a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
57161a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
57171a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
57181a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57191a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57201a83e813SShri Abhyankar     }
57211a83e813SShri Abhyankar 
57221a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
57231a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
57241a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
5725e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
57261a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57271a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
57281a83e813SShri Abhyankar 
57291a83e813SShri Abhyankar     /* U part */
57301a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
57311a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
57321a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
57331a83e813SShri Abhyankar     for (j=0; j<nz; j++){
57341a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57351a83e813SShri Abhyankar     }
57361a83e813SShri Abhyankar   }
57371a83e813SShri Abhyankar 
57381a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5739fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
57401a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
57411a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
57421a83e813SShri Abhyankar 
5743ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5744ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5745ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
5746ae3d28f0SHong Zhang   if (both_identity){
57474dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5748ae3d28f0SHong Zhang   } else {
57494dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5750ae3d28f0SHong Zhang   }
57514dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5752ae3d28f0SHong Zhang 
57531a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5754766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
57551a83e813SShri Abhyankar   PetscFunctionReturn(0);
57561a83e813SShri Abhyankar }
57571a83e813SShri Abhyankar 
57586bce7ff8SHong Zhang /*
57596bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
57604dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
57614dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
57626bce7ff8SHong Zhang */
5763c0c7eb62SShri Abhyankar 
57646bce7ff8SHong Zhang #undef __FUNCT__
57654dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
57664dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
57676bce7ff8SHong Zhang {
57686bce7ff8SHong Zhang 
57696bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
57706bce7ff8SHong Zhang   PetscErrorCode     ierr;
577116a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
577235aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
577335aa4fcfSShri Abhyankar 
577435aa4fcfSShri Abhyankar   PetscFunctionBegin;
577535aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
577635aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
577735aa4fcfSShri Abhyankar 
577835aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
577935aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
578035aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
578135aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
578235aa4fcfSShri Abhyankar   if (!b->diag){
578335aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
578435aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
578535aa4fcfSShri Abhyankar   }
578635aa4fcfSShri Abhyankar   bdiag = b->diag;
578735aa4fcfSShri Abhyankar 
578835aa4fcfSShri Abhyankar   if (n > 0) {
578935aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
579035aa4fcfSShri Abhyankar   }
579135aa4fcfSShri Abhyankar 
579235aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
579335aa4fcfSShri Abhyankar   bi = b->i;
579435aa4fcfSShri Abhyankar   bj = b->j;
579535aa4fcfSShri Abhyankar 
579635aa4fcfSShri Abhyankar   /* L part */
579735aa4fcfSShri Abhyankar   bi[0] = 0;
579835aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
579935aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
580035aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
580135aa4fcfSShri Abhyankar     aj = a->j + ai[i];
580235aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
580335aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
580435aa4fcfSShri Abhyankar     }
580535aa4fcfSShri Abhyankar   }
580635aa4fcfSShri Abhyankar 
580735aa4fcfSShri Abhyankar   /* U part */
580835aa4fcfSShri Abhyankar   bi_temp = bi[n];
580935aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
581035aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
581135aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
581235aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
581335aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
581435aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
581535aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
581635aa4fcfSShri Abhyankar     }
581735aa4fcfSShri Abhyankar     /* diag[i] */
581835aa4fcfSShri Abhyankar     *bj = i; bj++;
581935aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
582035aa4fcfSShri Abhyankar   }
582135aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
582235aa4fcfSShri Abhyankar }
582335aa4fcfSShri Abhyankar 
582435aa4fcfSShri Abhyankar #undef __FUNCT__
58254dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58264dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
582716a2bf60SHong Zhang {
582816a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
582916a2bf60SHong Zhang   IS                 isicol;
583016a2bf60SHong Zhang   PetscErrorCode     ierr;
583116a2bf60SHong Zhang   const PetscInt     *r,*ic;
58327fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
583316a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
583416a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
583516a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
5836ace3abfcSBarry Smith   PetscBool          col_identity,row_identity,both_identity;
583716a2bf60SHong Zhang   PetscReal          f;
583816a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
583916a2bf60SHong Zhang   PetscBT            lnkbt;
584016a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
584116a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
584216a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5843ace3abfcSBarry Smith   PetscBool          missing;
58447fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
584516a2bf60SHong Zhang 
584616a2bf60SHong Zhang   PetscFunctionBegin;
5847e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
58486ba06ab7SHong Zhang   if (bs>1){  /* check shifttype */
58496ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
58506ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
58516ba06ab7SHong Zhang   }
58526ba06ab7SHong Zhang 
585316a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5854e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
585516a2bf60SHong Zhang 
585616a2bf60SHong Zhang   f             = info->fill;
585716a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
585816a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
585916a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
586016a2bf60SHong Zhang 
586116a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
586216a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5863ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
586416a2bf60SHong Zhang 
58657fa3a6a0SHong Zhang   if (!levels && both_identity) {
586616a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
58674dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
58684dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
586935aa4fcfSShri Abhyankar 
5870d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
587135aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
587235aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
587335aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
587435aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
587535aa4fcfSShri Abhyankar     b->row           = isrow;
587635aa4fcfSShri Abhyankar     b->col           = iscol;
587735aa4fcfSShri Abhyankar     b->icol          = isicol;
587835aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
587935aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
588035aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
588135aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
588235aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
588335aa4fcfSShri Abhyankar   }
588435aa4fcfSShri Abhyankar 
588535aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
588635aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
588735aa4fcfSShri Abhyankar 
588835aa4fcfSShri Abhyankar   /* get new row pointers */
588935aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
589035aa4fcfSShri Abhyankar   bi[0] = 0;
589135aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
589235aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
589335aa4fcfSShri Abhyankar   bdiag[0]  = 0;
589435aa4fcfSShri Abhyankar 
5895fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
589635aa4fcfSShri Abhyankar 
589735aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
589835aa4fcfSShri Abhyankar   nlnk = n + 1;
589935aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
590035aa4fcfSShri Abhyankar 
590135aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
590235aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
590335aa4fcfSShri Abhyankar   current_space = free_space;
590435aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
590535aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
590635aa4fcfSShri Abhyankar 
590735aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
590835aa4fcfSShri Abhyankar     nzi = 0;
590935aa4fcfSShri Abhyankar     /* copy current row into linked list */
591035aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5911e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
591235aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
591335aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
591435aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
591535aa4fcfSShri Abhyankar     nzi += nlnk;
591635aa4fcfSShri Abhyankar 
591735aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
591835aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
591935aa4fcfSShri Abhyankar       fm = n;
592035aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
592135aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
592235aa4fcfSShri Abhyankar       lnk[fm]    = i;
592335aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
592435aa4fcfSShri Abhyankar       nzi++; dcount++;
592535aa4fcfSShri Abhyankar     }
592635aa4fcfSShri Abhyankar 
592735aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
592835aa4fcfSShri Abhyankar     nzbd = 0;
592935aa4fcfSShri Abhyankar     prow = lnk[n];
593035aa4fcfSShri Abhyankar     while (prow < i) {
593135aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
593235aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
593335aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
593435aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
593535aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
593635aa4fcfSShri Abhyankar       nzi += nlnk;
593735aa4fcfSShri Abhyankar       prow = lnk[prow];
593835aa4fcfSShri Abhyankar       nzbd++;
593935aa4fcfSShri Abhyankar     }
594035aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
594135aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
594235aa4fcfSShri Abhyankar 
594335aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
594435aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
594535aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
594635aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
594735aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
594835aa4fcfSShri Abhyankar       reallocs++;
594935aa4fcfSShri Abhyankar     }
595035aa4fcfSShri Abhyankar 
595135aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
595235aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
595335aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
595435aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
595535aa4fcfSShri Abhyankar 
595635aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
595765e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
595835aa4fcfSShri Abhyankar 
595935aa4fcfSShri Abhyankar     current_space->array           += nzi;
596035aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
596135aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
596235aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
596335aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
596435aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
596535aa4fcfSShri Abhyankar   }
596635aa4fcfSShri Abhyankar 
596735aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
596835aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
596935aa4fcfSShri Abhyankar 
597035aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
59719263d837SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
59722ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
597335aa4fcfSShri Abhyankar 
597435aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
597535aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5976fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
597735aa4fcfSShri Abhyankar 
597835aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
597935aa4fcfSShri Abhyankar   {
5980aef85c9fSShri Abhyankar     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
598135aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
598235aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
598335aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
598435aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
598535aa4fcfSShri Abhyankar     if (diagonal_fill) {
598635aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
598735aa4fcfSShri Abhyankar     }
598835aa4fcfSShri Abhyankar   }
598935aa4fcfSShri Abhyankar #endif
599035aa4fcfSShri Abhyankar 
599135aa4fcfSShri Abhyankar   /* put together the new matrix */
599235aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
599335aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
599435aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
599535aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
599635aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
599735aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
599835aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
599935aa4fcfSShri Abhyankar   b->j          = bj;
600035aa4fcfSShri Abhyankar   b->i          = bi;
600135aa4fcfSShri Abhyankar   b->diag       = bdiag;
600235aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
600335aa4fcfSShri Abhyankar   b->ilen       = 0;
600435aa4fcfSShri Abhyankar   b->imax       = 0;
600535aa4fcfSShri Abhyankar   b->row        = isrow;
600635aa4fcfSShri Abhyankar   b->col        = iscol;
600735aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
600835aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
600935aa4fcfSShri Abhyankar   b->icol       = isicol;
601035aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
601135aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
601235aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
601335aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
601435aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
6015ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
6016ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6017ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
60184dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
601935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
602035aa4fcfSShri Abhyankar }
602135aa4fcfSShri Abhyankar 
60224e2b4712SSatish Balay /*
60234e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
60244e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
60254e2b4712SSatish Balay    Not a good example of code reuse.
60264e2b4712SSatish Balay */
60274a2ae208SSatish Balay #undef __FUNCT__
602806e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
602906e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
60304e2b4712SSatish Balay {
60314e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
60324e2b4712SSatish Balay   IS             isicol;
60336849ba73SBarry Smith   PetscErrorCode ierr;
60345d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
60355d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6036a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6037d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6038ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity,flg;
6039329f5518SBarry Smith   PetscReal      f;
60404e2b4712SSatish Balay 
60414e2b4712SSatish Balay   PetscFunctionBegin;
60426bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6043e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
60446bce7ff8SHong Zhang 
6045435faa5fSBarry Smith   f             = info->fill;
6046690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
6047690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
60484c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
604916a2bf60SHong Zhang 
6050667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6051667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6052ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
6053309c388cSBarry Smith 
605441df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
605516a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
60568b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
60576bce7ff8SHong Zhang 
6058d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
6059ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
6060bb3d539aSBarry Smith     b->row       = isrow;
6061bb3d539aSBarry Smith     b->col       = iscol;
6062bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6063bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6064bb3d539aSBarry Smith     b->icol      = isicol;
6065bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6066b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
60676bce7ff8SHong Zhang     PetscFunctionReturn(0);
60686bce7ff8SHong Zhang   }
60696bce7ff8SHong Zhang 
60706bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
60714e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
60724e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
60734e2b4712SSatish Balay 
60744e2b4712SSatish Balay     /* get new row pointers */
6075690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
60764e2b4712SSatish Balay     ainew[0] = 0;
60774e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
6078690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
6079690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
60804e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
6081690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
60824e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
6083690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
60844e2b4712SSatish Balay     /* im is level for each filled value */
6085690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
60864e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
6087690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
60884e2b4712SSatish Balay     dloc[0]  = 0;
60894e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
6090435faa5fSBarry Smith 
6091435faa5fSBarry Smith       /* copy prow into linked list */
60924e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6093e32f2f54SBarry Smith       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
60944e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
60954e2b4712SSatish Balay       fill[n]    = n;
6096435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
60974e2b4712SSatish Balay       while (nz--) {
60984e2b4712SSatish Balay 	fm  = n;
60994e2b4712SSatish Balay 	idx = ic[*xi++];
61004e2b4712SSatish Balay 	do {
61014e2b4712SSatish Balay 	  m  = fm;
61024e2b4712SSatish Balay 	  fm = fill[m];
61034e2b4712SSatish Balay 	} while (fm < idx);
61044e2b4712SSatish Balay 	fill[m]   = idx;
61054e2b4712SSatish Balay 	fill[idx] = fm;
61064e2b4712SSatish Balay 	im[idx]   = 0;
61074e2b4712SSatish Balay       }
6108435faa5fSBarry Smith 
6109435faa5fSBarry Smith       /* make sure diagonal entry is included */
6110435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
6111435faa5fSBarry Smith 	fm = n;
6112435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
6113435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6114435faa5fSBarry Smith 	fill[fm]   = prow;
6115435faa5fSBarry Smith 	im[prow]   = 0;
6116435faa5fSBarry Smith 	nzf++;
6117335d9088SBarry Smith 	dcount++;
6118435faa5fSBarry Smith       }
6119435faa5fSBarry Smith 
61204e2b4712SSatish Balay       nzi = 0;
61214e2b4712SSatish Balay       row = fill[n];
61224e2b4712SSatish Balay       while (row < prow) {
61234e2b4712SSatish Balay 	incrlev = im[row] + 1;
61244e2b4712SSatish Balay 	nz      = dloc[row];
6125435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
61264e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
61274e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
61284e2b4712SSatish Balay 	fm      = row;
61294e2b4712SSatish Balay 	while (nnz-- > 0) {
61304e2b4712SSatish Balay 	  idx = *xi++;
61314e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
61324e2b4712SSatish Balay 	    flev++;
61334e2b4712SSatish Balay 	    continue;
61344e2b4712SSatish Balay 	  }
61354e2b4712SSatish Balay 	  do {
61364e2b4712SSatish Balay 	    m  = fm;
61374e2b4712SSatish Balay 	    fm = fill[m];
61384e2b4712SSatish Balay 	  } while (fm < idx);
61394e2b4712SSatish Balay 	  if (fm != idx) {
61404e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
61414e2b4712SSatish Balay 	    fill[m]   = idx;
61424e2b4712SSatish Balay 	    fill[idx] = fm;
61434e2b4712SSatish Balay 	    fm        = idx;
61444e2b4712SSatish Balay 	    nzf++;
6145ecf371e4SBarry Smith 	  } else {
61464e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
61474e2b4712SSatish Balay 	  }
61484e2b4712SSatish Balay 	  flev++;
61494e2b4712SSatish Balay 	}
61504e2b4712SSatish Balay 	row = fill[row];
61514e2b4712SSatish Balay 	nzi++;
61524e2b4712SSatish Balay       }
61534e2b4712SSatish Balay       /* copy new filled row into permanent storage */
61544e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
61554e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6156ecf371e4SBarry Smith 
6157ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
6158ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6159ecf371e4SBarry Smith 	/* just double the memory each time */
6160690b6cddSBarry Smith 	PetscInt maxadd = jmax;
6161ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
61624e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
61634e2b4712SSatish Balay 	jmax += maxadd;
6164ecf371e4SBarry Smith 
6165ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
61665d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61675d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6168606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
61695d0c19d7SBarry Smith 	ajnew = xitmp;
61705d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61715d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6172606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
61735d0c19d7SBarry Smith 	ajfill = xitmp;
6174eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
61754e2b4712SSatish Balay       }
61765d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
61774e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
61784e2b4712SSatish Balay       dloc[prow]  = nzi;
61794e2b4712SSatish Balay       fm          = fill[n];
61804e2b4712SSatish Balay       while (nzf--) {
61815d0c19d7SBarry Smith 	*xitmp++ = fm;
61824e2b4712SSatish Balay 	*flev++ = im[fm];
61834e2b4712SSatish Balay 	fm      = fill[fm];
61844e2b4712SSatish Balay       }
6185435faa5fSBarry Smith       /* make sure row has diagonal entry */
6186435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6187e32f2f54SBarry Smith 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
61882401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6189435faa5fSBarry Smith       }
61904e2b4712SSatish Balay     }
6191606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
61924e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
61934e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6194606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6195606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
61964e2b4712SSatish Balay 
61976cf91177SBarry Smith #if defined(PETSC_USE_INFO)
61984e2b4712SSatish Balay     {
6199329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6200ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6201ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6202ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6203ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6204335d9088SBarry Smith       if (diagonal_fill) {
6205ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6206335d9088SBarry Smith       }
62074e2b4712SSatish Balay     }
620863ba0a88SBarry Smith #endif
62094e2b4712SSatish Balay 
62104e2b4712SSatish Balay     /* put together the new matrix */
6211719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6212719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6213ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
6214e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6215e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
62167c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6217a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
62184e2b4712SSatish Balay     b->j          = ajnew;
62194e2b4712SSatish Balay     b->i          = ainew;
62204e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
62214e2b4712SSatish Balay     b->diag       = dloc;
62227f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
62234e2b4712SSatish Balay     b->ilen       = 0;
62244e2b4712SSatish Balay     b->imax       = 0;
62254e2b4712SSatish Balay     b->row        = isrow;
62264e2b4712SSatish Balay     b->col        = iscol;
6227bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6228c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6229c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6230e51c0b9cSSatish Balay     b->icol       = isicol;
623187828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
62324e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
62334e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6234719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
62354e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
62364e2b4712SSatish Balay 
6237ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
6238ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
6239ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
62406bce7ff8SHong Zhang 
62418b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
62428661488fSKris Buschelman   PetscFunctionReturn(0);
62438661488fSKris Buschelman }
62448661488fSKris Buschelman 
6245732ee342SKris Buschelman #undef __FUNCT__
62467e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6247dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
62487e7071cdSKris Buschelman {
624912272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
625012272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
62515a9542e3SKris Buschelman   PetscFunctionBegin;
62527cf1b8d3SKris Buschelman   /* Undo Column scaling */
62537cf1b8d3SKris Buschelman /*    while (nz--) { */
62547cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
62557cf1b8d3SKris Buschelman /*    } */
6256c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6257c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62587cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
62597cf1b8d3SKris Buschelman }
62607cf1b8d3SKris Buschelman 
62617cf1b8d3SKris Buschelman #undef __FUNCT__
62627cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6263dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
62647cf1b8d3SKris Buschelman {
62657cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6266b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
62672aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
62685a9542e3SKris Buschelman   PetscFunctionBegin;
62690b9da03eSKris Buschelman   /* Is this really necessary? */
627020235379SKris Buschelman   while (nz--) {
62710b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
62727e7071cdSKris Buschelman   }
6273c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62747e7071cdSKris Buschelman   PetscFunctionReturn(0);
62757e7071cdSKris Buschelman }
62767e7071cdSKris Buschelman 
6277732ee342SKris Buschelman 
6278