xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 26fbe8dc1a3c99fb8dddfa572c8c6b3b4ce3ca53)
1be1d678aSKris Buschelman 
/*
    Factorization code for BAIJ format.
*/
54e2b4712SSatish Balay 
6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
7c6db04a5SJed Brown #include <../src/mat/blockinvert.h>
8c6db04a5SJed Brown #include <petscbt.h>
9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1493fd935bSShri Abhyankar {
1593fd935bSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1693fd935bSShri Abhyankar   PetscErrorCode    ierr;
1793fd935bSShri Abhyankar   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
1893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
1993fd935bSShri Abhyankar   PetscInt          nz;
2093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
2193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
2293fd935bSShri Abhyankar   const PetscScalar *b;
2393fd935bSShri Abhyankar 
2493fd935bSShri Abhyankar   PetscFunctionBegin;
253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2793fd935bSShri Abhyankar   tmp  = a->solve_work;
2893fd935bSShri Abhyankar 
2993fd935bSShri Abhyankar 
3093fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
3193fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[i];
3293fd935bSShri Abhyankar 
3393fd935bSShri Abhyankar   /* forward solve the U^T */
3493fd935bSShri Abhyankar   for (i=0; i<n; i++) {
3593fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
3693fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
3793fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
3893fd935bSShri Abhyankar     s1  = tmp[i];
3993fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
4093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
4193fd935bSShri Abhyankar     tmp[i] = s1;
4293fd935bSShri Abhyankar   }
4393fd935bSShri Abhyankar 
4493fd935bSShri Abhyankar   /* backward solve the L^T */
4593fd935bSShri Abhyankar   for (i=n-1; i>=0; i--) {
4693fd935bSShri Abhyankar     v  = aa + ai[i];
4793fd935bSShri Abhyankar     vi = aj + ai[i];
4893fd935bSShri Abhyankar     nz = ai[i+1] - ai[i];
4993fd935bSShri Abhyankar     s1 = tmp[i];
5093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
5193fd935bSShri Abhyankar   }
5293fd935bSShri Abhyankar 
5393fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
5493fd935bSShri Abhyankar   for (i=0; i<n; i++) x[i] = tmp[i];
5593fd935bSShri Abhyankar 
563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5793fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5893fd935bSShri Abhyankar 
5993fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
6093fd935bSShri Abhyankar   PetscFunctionReturn(0);
6193fd935bSShri Abhyankar }
6293fd935bSShri Abhyankar 
6393fd935bSShri Abhyankar #undef __FUNCT__
6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66f1af5d2fSBarry Smith {
67f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
68dfbe8321SBarry Smith   PetscErrorCode  ierr;
690b68f018SBarry Smith   PetscInt        i,nz;
700b68f018SBarry Smith   const PetscInt  *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
710b68f018SBarry Smith   const MatScalar *aa   =a->a,*v;
720b68f018SBarry Smith   PetscScalar     s1,*x;
73f1af5d2fSBarry Smith 
74f1af5d2fSBarry Smith   PetscFunctionBegin;
75ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77f1af5d2fSBarry Smith 
78f1af5d2fSBarry Smith   /* forward solve the U^T */
79f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith     v = aa + diag[i];
82f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
83ef66eb69SBarry Smith     s1 = (*v++)*x[i];
84f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       x[*vi++] -= (*v++)*s1;
88f1af5d2fSBarry Smith     }
89f1af5d2fSBarry Smith     x[i] = s1;
90f1af5d2fSBarry Smith   }
91f1af5d2fSBarry Smith   /* backward solve the L^T */
92f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
93f1af5d2fSBarry Smith     v  = aa + diag[i] - 1;
94f1af5d2fSBarry Smith     vi = aj + diag[i] - 1;
95f1af5d2fSBarry Smith     nz = diag[i] - ai[i];
96f1af5d2fSBarry Smith     s1 = x[i];
97f1af5d2fSBarry Smith     while (nz--) {
98f1af5d2fSBarry Smith       x[*vi--] -=  (*v--)*s1;
99f1af5d2fSBarry Smith     }
100f1af5d2fSBarry Smith   }
1011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103f1af5d2fSBarry Smith   PetscFunctionReturn(0);
104f1af5d2fSBarry Smith }
105f1af5d2fSBarry Smith 
1064a2ae208SSatish Balay #undef __FUNCT__
10706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
10806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109f1af5d2fSBarry Smith {
110f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
111dfbe8321SBarry Smith   PetscErrorCode  ierr;
112b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
113b3260449SShri Abhyankar   const PetscInt  *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114b3260449SShri Abhyankar   const MatScalar *aa   =a->a,*v;
115b3260449SShri Abhyankar   PetscScalar     s1,s2,x1,x2,*x;
116f1af5d2fSBarry Smith 
117f1af5d2fSBarry Smith   PetscFunctionBegin;
118ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120f1af5d2fSBarry Smith 
121f1af5d2fSBarry Smith   /* forward solve the U^T */
122f1af5d2fSBarry Smith   idx = 0;
123f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
124f1af5d2fSBarry Smith 
125f1af5d2fSBarry Smith     v = aa + 4*diag[i];
126f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
127ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
128f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
129f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
130f1af5d2fSBarry Smith     v += 4;
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
133f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
134f1af5d2fSBarry Smith     while (nz--) {
135f1af5d2fSBarry Smith       oidx       = 2*(*vi++);
136f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138f1af5d2fSBarry Smith       v         += 4;
139f1af5d2fSBarry Smith     }
140f1af5d2fSBarry Smith     x[idx] = s1;x[1+idx] = s2;
141f1af5d2fSBarry Smith     idx   += 2;
142f1af5d2fSBarry Smith   }
143f1af5d2fSBarry Smith   /* backward solve the L^T */
144f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
145f1af5d2fSBarry Smith     v   = aa + 4*diag[i] - 4;
146f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
147f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
148f1af5d2fSBarry Smith     idt = 2*i;
149f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt];
150f1af5d2fSBarry Smith     while (nz--) {
151f1af5d2fSBarry Smith       idx       = 2*(*vi--);
152f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154f1af5d2fSBarry Smith       v        -= 4;
155f1af5d2fSBarry Smith     }
156f1af5d2fSBarry Smith   }
1571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159f1af5d2fSBarry Smith   PetscFunctionReturn(0);
160f1af5d2fSBarry Smith }
161f1af5d2fSBarry Smith 
1624a2ae208SSatish Balay #undef __FUNCT__
1634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1656929473cSShri Abhyankar {
1666929473cSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
1676929473cSShri Abhyankar   PetscErrorCode  ierr;
168b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1696929473cSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
170b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
171b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
172b3260449SShri Abhyankar   PetscScalar     s1,s2,x1,x2,*x;
1736929473cSShri Abhyankar 
1746929473cSShri Abhyankar   PetscFunctionBegin;
1756929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1766929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1776929473cSShri Abhyankar 
1786929473cSShri Abhyankar   /* forward solve the U^T */
1796929473cSShri Abhyankar   idx = 0;
1806929473cSShri Abhyankar   for (i=0; i<n; i++) {
1816929473cSShri Abhyankar     v = aa + bs2*diag[i];
1826929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1836929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1846929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1856929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1866929473cSShri Abhyankar     v -= bs2;
1876929473cSShri Abhyankar 
1886929473cSShri Abhyankar     vi = aj + diag[i] - 1;
1896929473cSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
1906929473cSShri Abhyankar     for (j=0; j>-nz; j--) {
1916929473cSShri Abhyankar       oidx       = bs*vi[j];
1926929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1936929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1946929473cSShri Abhyankar       v         -= bs2;
1956929473cSShri Abhyankar     }
1966929473cSShri Abhyankar     x[idx] = s1;x[1+idx] = s2;
1976929473cSShri Abhyankar     idx   += bs;
1986929473cSShri Abhyankar   }
1996929473cSShri Abhyankar   /* backward solve the L^T */
2006929473cSShri Abhyankar   for (i=n-1; i>=0; i--) {
2016929473cSShri Abhyankar     v   = aa + bs2*ai[i];
2026929473cSShri Abhyankar     vi  = aj + ai[i];
2036929473cSShri Abhyankar     nz  = ai[i+1] - ai[i];
2046929473cSShri Abhyankar     idt = bs*i;
2056929473cSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];
2066929473cSShri Abhyankar     for (j=0; j<nz; j++) {
2076929473cSShri Abhyankar       idx       = bs*vi[j];
2086929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
2096929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
2106929473cSShri Abhyankar       v        += bs2;
2116929473cSShri Abhyankar     }
2126929473cSShri Abhyankar   }
2136929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2146929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2156929473cSShri Abhyankar   PetscFunctionReturn(0);
2166929473cSShri Abhyankar }
2176929473cSShri Abhyankar 
2186929473cSShri Abhyankar #undef __FUNCT__
21906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
22006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221f1af5d2fSBarry Smith {
222f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
223dfbe8321SBarry Smith   PetscErrorCode  ierr;
224b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
226b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
227b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,x1,x2,x3,*x;
228f1af5d2fSBarry Smith 
229f1af5d2fSBarry Smith   PetscFunctionBegin;
230ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232f1af5d2fSBarry Smith 
233f1af5d2fSBarry Smith   /* forward solve the U^T */
234f1af5d2fSBarry Smith   idx = 0;
235f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
236f1af5d2fSBarry Smith 
237f1af5d2fSBarry Smith     v = aa + 9*diag[i];
238f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
239ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243f1af5d2fSBarry Smith     v += 9;
244f1af5d2fSBarry Smith 
245f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
246f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
247f1af5d2fSBarry Smith     while (nz--) {
248f1af5d2fSBarry Smith       oidx       = 3*(*vi++);
249f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252f1af5d2fSBarry Smith       v         += 9;
253f1af5d2fSBarry Smith     }
254f1af5d2fSBarry Smith     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
255f1af5d2fSBarry Smith     idx   += 3;
256f1af5d2fSBarry Smith   }
257f1af5d2fSBarry Smith   /* backward solve the L^T */
258f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
259f1af5d2fSBarry Smith     v   = aa + 9*diag[i] - 9;
260f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
261f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
262f1af5d2fSBarry Smith     idt = 3*i;
263f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264f1af5d2fSBarry Smith     while (nz--) {
265f1af5d2fSBarry Smith       idx       = 3*(*vi--);
266f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269f1af5d2fSBarry Smith       v        -= 9;
270f1af5d2fSBarry Smith     }
271f1af5d2fSBarry Smith   }
2721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274f1af5d2fSBarry Smith   PetscFunctionReturn(0);
275f1af5d2fSBarry Smith }
276f1af5d2fSBarry Smith 
2774a2ae208SSatish Balay #undef __FUNCT__
2784dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2794dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2808499736aSShri Abhyankar {
2818499736aSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
2828499736aSShri Abhyankar   PetscErrorCode  ierr;
283b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2848499736aSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
285b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
286b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
287b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,x1,x2,x3,*x;
2888499736aSShri Abhyankar 
2898499736aSShri Abhyankar   PetscFunctionBegin;
2908499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2918499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2928499736aSShri Abhyankar 
2938499736aSShri Abhyankar   /* forward solve the U^T */
2948499736aSShri Abhyankar   idx = 0;
2958499736aSShri Abhyankar   for (i=0; i<n; i++) {
2968499736aSShri Abhyankar     v = aa + bs2*diag[i];
2978499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
2988499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
2998499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
3008499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
3018499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
3028499736aSShri Abhyankar     v -= bs2;
3038499736aSShri Abhyankar 
3048499736aSShri Abhyankar     vi = aj + diag[i] - 1;
3058499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
3068499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
3078499736aSShri Abhyankar       oidx       = bs*vi[j];
3088499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3098499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3108499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3118499736aSShri Abhyankar       v         -= bs2;
3128499736aSShri Abhyankar     }
3138499736aSShri Abhyankar     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;
3148499736aSShri Abhyankar     idx   += bs;
3158499736aSShri Abhyankar   }
3168499736aSShri Abhyankar   /* backward solve the L^T */
3178499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
3188499736aSShri Abhyankar     v   = aa + bs2*ai[i];
3198499736aSShri Abhyankar     vi  = aj + ai[i];
3208499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
3218499736aSShri Abhyankar     idt = bs*i;
3228499736aSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
3238499736aSShri Abhyankar     for (j=0; j<nz; j++) {
3248499736aSShri Abhyankar       idx       = bs*vi[j];
3258499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3268499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3278499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3288499736aSShri Abhyankar       v        += bs2;
3298499736aSShri Abhyankar     }
3308499736aSShri Abhyankar   }
3318499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3328499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3338499736aSShri Abhyankar   PetscFunctionReturn(0);
3348499736aSShri Abhyankar }
3358499736aSShri Abhyankar 
3368499736aSShri Abhyankar #undef __FUNCT__
33706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
33806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339f1af5d2fSBarry Smith {
340f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
341dfbe8321SBarry Smith   PetscErrorCode  ierr;
342b3260449SShri Abhyankar   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
344b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
345b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;
346f1af5d2fSBarry Smith 
347f1af5d2fSBarry Smith   PetscFunctionBegin;
348ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350f1af5d2fSBarry Smith 
351f1af5d2fSBarry Smith   /* forward solve the U^T */
352f1af5d2fSBarry Smith   idx = 0;
353f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
354f1af5d2fSBarry Smith 
355f1af5d2fSBarry Smith     v = aa + 16*diag[i];
356f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
357ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362f1af5d2fSBarry Smith     v += 16;
363f1af5d2fSBarry Smith 
364f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
365f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
366f1af5d2fSBarry Smith     while (nz--) {
367f1af5d2fSBarry Smith       oidx       = 4*(*vi++);
368f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372f1af5d2fSBarry Smith       v         += 16;
373f1af5d2fSBarry Smith     }
374f1af5d2fSBarry Smith     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375f1af5d2fSBarry Smith     idx   += 4;
376f1af5d2fSBarry Smith   }
377f1af5d2fSBarry Smith   /* backward solve the L^T */
378f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
379f1af5d2fSBarry Smith     v   = aa + 16*diag[i] - 16;
380f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
381f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
382f1af5d2fSBarry Smith     idt = 4*i;
383f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384f1af5d2fSBarry Smith     while (nz--) {
385f1af5d2fSBarry Smith       idx       = 4*(*vi--);
386f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390f1af5d2fSBarry Smith       v        -= 16;
391f1af5d2fSBarry Smith     }
392f1af5d2fSBarry Smith   }
3931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395f1af5d2fSBarry Smith   PetscFunctionReturn(0);
396f1af5d2fSBarry Smith }
397f1af5d2fSBarry Smith 
3984a2ae208SSatish Balay #undef __FUNCT__
3994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
4004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4018499736aSShri Abhyankar {
4028499736aSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
4038499736aSShri Abhyankar   PetscErrorCode  ierr;
404b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
4058499736aSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
406b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
407b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
408b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;
4098499736aSShri Abhyankar 
4108499736aSShri Abhyankar   PetscFunctionBegin;
4118499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4128499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4138499736aSShri Abhyankar 
4148499736aSShri Abhyankar   /* forward solve the U^T */
4158499736aSShri Abhyankar   idx = 0;
4168499736aSShri Abhyankar   for (i=0; i<n; i++) {
4178499736aSShri Abhyankar     v = aa + bs2*diag[i];
4188499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
4198499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
4208499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
4218499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
4228499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
4238499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
4248499736aSShri Abhyankar     v -= bs2;
4258499736aSShri Abhyankar 
4268499736aSShri Abhyankar     vi = aj + diag[i] - 1;
4278499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
4288499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
4298499736aSShri Abhyankar       oidx       = bs*vi[j];
4308499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4318499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4328499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4338499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4348499736aSShri Abhyankar       v         -= bs2;
4358499736aSShri Abhyankar     }
4368499736aSShri Abhyankar     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4378499736aSShri Abhyankar     idx   += bs;
4388499736aSShri Abhyankar   }
4398499736aSShri Abhyankar   /* backward solve the L^T */
4408499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
4418499736aSShri Abhyankar     v   = aa + bs2*ai[i];
4428499736aSShri Abhyankar     vi  = aj + ai[i];
4438499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
4448499736aSShri Abhyankar     idt = bs*i;
4458499736aSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4468499736aSShri Abhyankar     for (j=0; j<nz; j++) {
4478499736aSShri Abhyankar       idx       = bs*vi[j];
4488499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4498499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4508499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4518499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4528499736aSShri Abhyankar       v        += bs2;
4538499736aSShri Abhyankar     }
4548499736aSShri Abhyankar   }
4558499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4578499736aSShri Abhyankar   PetscFunctionReturn(0);
4588499736aSShri Abhyankar }
4598499736aSShri Abhyankar 
4608499736aSShri Abhyankar #undef __FUNCT__
46106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
46206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463f1af5d2fSBarry Smith {
464f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
465dfbe8321SBarry Smith   PetscErrorCode  ierr;
466b3260449SShri Abhyankar   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
468b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
469b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   PetscFunctionBegin;
472ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   /* forward solve the U^T */
476f1af5d2fSBarry Smith   idx = 0;
477f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith     v = aa + 25*diag[i];
480f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
481ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487f1af5d2fSBarry Smith     v += 25;
488f1af5d2fSBarry Smith 
489f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
490f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
491f1af5d2fSBarry Smith     while (nz--) {
492f1af5d2fSBarry Smith       oidx       = 5*(*vi++);
493f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498f1af5d2fSBarry Smith       v         += 25;
499f1af5d2fSBarry Smith     }
500f1af5d2fSBarry Smith     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501f1af5d2fSBarry Smith     idx   += 5;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
505f1af5d2fSBarry Smith     v   = aa + 25*diag[i] - 25;
506f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
508f1af5d2fSBarry Smith     idt = 5*i;
509f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510f1af5d2fSBarry Smith     while (nz--) {
511f1af5d2fSBarry Smith       idx       = 5*(*vi--);
512f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517f1af5d2fSBarry Smith       v        -= 25;
518f1af5d2fSBarry Smith     }
519f1af5d2fSBarry Smith   }
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5288499736aSShri Abhyankar {
5298499736aSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
5308499736aSShri Abhyankar   PetscErrorCode  ierr;
531b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5328499736aSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
533b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
534b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
535b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
5368499736aSShri Abhyankar 
5378499736aSShri Abhyankar   PetscFunctionBegin;
5388499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5398499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5408499736aSShri Abhyankar 
5418499736aSShri Abhyankar   /* forward solve the U^T */
5428499736aSShri Abhyankar   idx = 0;
5438499736aSShri Abhyankar   for (i=0; i<n; i++) {
5448499736aSShri Abhyankar     v = aa + bs2*diag[i];
5458499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5468499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5478499736aSShri Abhyankar     x5 = x[4+idx];
5488499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5498499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5508499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5518499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5528499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5538499736aSShri Abhyankar     v -= bs2;
5548499736aSShri Abhyankar 
5558499736aSShri Abhyankar     vi = aj + diag[i] - 1;
5568499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
5578499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
5588499736aSShri Abhyankar       oidx       = bs*vi[j];
5598499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5608499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5618499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5628499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5638499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5648499736aSShri Abhyankar       v         -= bs2;
5658499736aSShri Abhyankar     }
5668499736aSShri Abhyankar     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5678499736aSShri Abhyankar     idx   += bs;
5688499736aSShri Abhyankar   }
5698499736aSShri Abhyankar   /* backward solve the L^T */
5708499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
5718499736aSShri Abhyankar     v   = aa + bs2*ai[i];
5728499736aSShri Abhyankar     vi  = aj + ai[i];
5738499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
5748499736aSShri Abhyankar     idt = bs*i;
5758499736aSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
5768499736aSShri Abhyankar     for (j=0; j<nz; j++) {
5778499736aSShri Abhyankar       idx       = bs*vi[j];
5788499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5798499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5808499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5818499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5828499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5838499736aSShri Abhyankar       v        += bs2;
5848499736aSShri Abhyankar     }
5858499736aSShri Abhyankar   }
5868499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5878499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5888499736aSShri Abhyankar   PetscFunctionReturn(0);
5898499736aSShri Abhyankar }
5908499736aSShri Abhyankar 
/*
   MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace - transpose triangular solves with
   6x6 blocks (36 scalars per block); no index-set permutation is applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs) is read
            bb - right-hand side; copied into xx before any arithmetic
   Output:  xx - overwritten in place with the result

   Layout read by this routine: block row i occupies block indices ai[i] .. ai[i+1]-1 and
   diag[i] marks the diagonal block inside that range.  Blocks after diag[i] are used by
   the U^T sweep, blocks before it by the L^T sweep.

   Returns 0 on the normal path; a nonzero ierr from the Vec/log calls goes through CHKERRQ.
*/
5918499736aSShri Abhyankar #undef __FUNCT__
59206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
59306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594f1af5d2fSBarry Smith {
595f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
596dfbe8321SBarry Smith   PetscErrorCode  ierr;
597b3260449SShri Abhyankar   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
599b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
600b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* all arithmetic below is done directly on the array of xx, which starts as a copy of bb */
603ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605f1af5d2fSBarry Smith 
606f1af5d2fSBarry Smith   /* forward solve the U^T */
607f1af5d2fSBarry Smith   idx = 0;
608f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
609f1af5d2fSBarry Smith 
610f1af5d2fSBarry Smith     v = aa + 36*diag[i];
611f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
612ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613ef66eb69SBarry Smith     x6 = x[5+idx];
    /* s_k = dot product of the k-th run of 6 consecutive block entries with (x1..x6) */
614f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    /* step past the diagonal block: v now addresses block index diag[i]+1 */
620f1af5d2fSBarry Smith     v += 36;
621f1af5d2fSBarry Smith 
622f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
623f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
    /* for each block after the diagonal in row i, subtract block*s from the x entries of its block column */
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       oidx       = 6*(*vi++);
626f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v         += 36;
633f1af5d2fSBarry Smith     }
    /* store the scaled values back into the 6 entries of block row i */
634f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635f1af5d2fSBarry Smith     x[5+idx] = s6;
636f1af5d2fSBarry Smith     idx     += 6;
637f1af5d2fSBarry Smith   }
638f1af5d2fSBarry Smith   /* backward solve the L^T */
639f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the block just before the diagonal and walk down to block index ai[i] */
640f1af5d2fSBarry Smith     v   = aa + 36*diag[i] - 36;
641f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
642f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
643f1af5d2fSBarry Smith     idt = 6*i;
644f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645f1af5d2fSBarry Smith     s6  = x[5+idt];
646f1af5d2fSBarry Smith     while (nz--) {
647f1af5d2fSBarry Smith       idx       = 6*(*vi--);
648f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654f1af5d2fSBarry Smith       v        -= 36;
655f1af5d2fSBarry Smith     }
656f1af5d2fSBarry Smith   }
6571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659f1af5d2fSBarry Smith   PetscFunctionReturn(0);
660f1af5d2fSBarry Smith }
661f1af5d2fSBarry Smith 
/*
   MatSolveTranspose_SeqBAIJ_6_NaturalOrdering - transpose triangular solves with 6x6
   blocks; no index-set permutation is applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs, a->bs2)
                 and A->rmap->bs are read
            bb - right-hand side; copied into xx before any arithmetic
   Output:  xx - overwritten in place with the result

   Layout read by this routine:
     - the diagonal block of row i sits at block index diag[i];
     - the remaining U blocks of row i sit at diag[i+1]+1 .. diag[i]-1 and are visited from
       diag[i]-1 downward;
     - the L blocks of row i sit at ai[i] .. ai[i+1]-1.
   diag[i+1] is read for i = n-1, so diag must hold at least n+1 entries.

   bs and bs2 are taken from the matrix, but the unrolled arithmetic touches exactly 6 vector
   entries and 36 block entries, so this presumably requires bs == 6 and bs2 == 36.

   Returns 0 on the normal path; a nonzero ierr from the Vec/log calls goes through CHKERRQ.
*/
6624a2ae208SSatish Balay #undef __FUNCT__
6634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6658499736aSShri Abhyankar {
6668499736aSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
6678499736aSShri Abhyankar   PetscErrorCode  ierr;
668b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6698499736aSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
670b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
671b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
672b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
6738499736aSShri Abhyankar 
6748499736aSShri Abhyankar   PetscFunctionBegin;
  /* all arithmetic below is done directly on the array of xx, which starts as a copy of bb */
6758499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6768499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
6778499736aSShri Abhyankar 
6788499736aSShri Abhyankar   /* forward solve the U^T */
6798499736aSShri Abhyankar   idx = 0;
6808499736aSShri Abhyankar   for (i=0; i<n; i++) {
6818499736aSShri Abhyankar     v = aa + bs2*diag[i];
6828499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
6838499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
6848499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
    /* s_k = dot product of the k-th run of 6 consecutive block entries with (x1..x6) */
6858499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
6868499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
6878499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
6888499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
6898499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
6908499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    /* step back one block: v now addresses block index diag[i]-1 */
6918499736aSShri Abhyankar     v -= bs2;
6928499736aSShri Abhyankar 
6938499736aSShri Abhyankar     vi = aj + diag[i] - 1;
6948499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
    /* j runs 0,-1,...,-(nz-1): vi[j] and v are both walked toward lower block indices */
6958499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
6968499736aSShri Abhyankar       oidx       = bs*vi[j];
6978499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
6988499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
6998499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7008499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7018499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7028499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7038499736aSShri Abhyankar       v         -= bs2;
7048499736aSShri Abhyankar     }
    /* store the scaled values back into the 6 entries of block row i */
7058499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
7068499736aSShri Abhyankar     x[5+idx] = s6;
7078499736aSShri Abhyankar     idx     += bs;
7088499736aSShri Abhyankar   }
7098499736aSShri Abhyankar   /* backward solve the L^T */
7108499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
    /* the L blocks of row i are visited in increasing block index, ai[i] .. ai[i+1]-1 */
7118499736aSShri Abhyankar     v   = aa + bs2*ai[i];
7128499736aSShri Abhyankar     vi  = aj + ai[i];
7138499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
7148499736aSShri Abhyankar     idt = bs*i;
7158499736aSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
7168499736aSShri Abhyankar     s6  = x[5+idt];
7178499736aSShri Abhyankar     for (j=0; j<nz; j++) {
7188499736aSShri Abhyankar       idx       = bs*vi[j];
7198499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7208499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7218499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7228499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7238499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7248499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7258499736aSShri Abhyankar       v        += bs2;
7268499736aSShri Abhyankar     }
7278499736aSShri Abhyankar   }
7288499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7298499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7308499736aSShri Abhyankar   PetscFunctionReturn(0);
7318499736aSShri Abhyankar }
7328499736aSShri Abhyankar 
/*
   MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace - transpose triangular solves with
   7x7 blocks (49 scalars per block); no index-set permutation is applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs) is read
            bb - right-hand side; copied into xx before any arithmetic
   Output:  xx - overwritten in place with the result

   Layout read by this routine: block row i occupies block indices ai[i] .. ai[i+1]-1 and
   diag[i] marks the diagonal block inside that range.  Blocks after diag[i] are used by
   the U^T sweep, blocks before it by the L^T sweep.

   Returns 0 on the normal path; a nonzero ierr from the Vec/log calls goes through CHKERRQ.
*/
7338499736aSShri Abhyankar #undef __FUNCT__
73406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
73506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736f1af5d2fSBarry Smith {
737f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
738dfbe8321SBarry Smith   PetscErrorCode  ierr;
739b3260449SShri Abhyankar   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740b3260449SShri Abhyankar   PetscInt        i,nz,idx,idt,oidx;
741b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
742b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743f1af5d2fSBarry Smith 
744f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* all arithmetic below is done directly on the array of xx, which starts as a copy of bb */
745ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7461ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith   /* forward solve the U^T */
749f1af5d2fSBarry Smith   idx = 0;
750f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
751f1af5d2fSBarry Smith 
752f1af5d2fSBarry Smith     v = aa + 49*diag[i];
753f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
754ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755ef66eb69SBarry Smith     x6 = x[5+idx]; x7 = x[6+idx];
    /* s_k = dot product of the k-th run of 7 consecutive block entries with (x1..x7) */
756f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    /* step past the diagonal block: v now addresses block index diag[i]+1 */
763f1af5d2fSBarry Smith     v += 49;
764f1af5d2fSBarry Smith 
765f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
766f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
    /* for each block after the diagonal in row i, subtract block*s from the x entries of its block column */
767f1af5d2fSBarry Smith     while (nz--) {
768f1af5d2fSBarry Smith       oidx       = 7*(*vi++);
769f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776f1af5d2fSBarry Smith       v         += 49;
777f1af5d2fSBarry Smith     }
    /* store the scaled values back into the 7 entries of block row i */
778f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
780f1af5d2fSBarry Smith     idx     += 7;
781f1af5d2fSBarry Smith   }
782f1af5d2fSBarry Smith   /* backward solve the L^T */
783f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the block just before the diagonal and walk down to block index ai[i] */
784f1af5d2fSBarry Smith     v   = aa + 49*diag[i] - 49;
785f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
786f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
787f1af5d2fSBarry Smith     idt = 7*i;
788f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789f1af5d2fSBarry Smith     s6  = x[5+idt];s7 = x[6+idt];
790f1af5d2fSBarry Smith     while (nz--) {
791f1af5d2fSBarry Smith       idx       = 7*(*vi--);
792f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799f1af5d2fSBarry Smith       v        -= 49;
800f1af5d2fSBarry Smith     }
801f1af5d2fSBarry Smith   }
8021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804f1af5d2fSBarry Smith   PetscFunctionReturn(0);
805f1af5d2fSBarry Smith }
/*
   MatSolveTranspose_SeqBAIJ_7_NaturalOrdering - transpose triangular solves with 7x7
   blocks; no index-set permutation is applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs, a->bs2)
                 and A->rmap->bs are read
            bb - right-hand side; copied into xx before any arithmetic
   Output:  xx - overwritten in place with the result

   Layout read by this routine:
     - the diagonal block of row i sits at block index diag[i];
     - the remaining U blocks of row i sit at diag[i+1]+1 .. diag[i]-1 and are visited from
       diag[i]-1 downward;
     - the L blocks of row i sit at ai[i] .. ai[i+1]-1.
   diag[i+1] is read for i = n-1, so diag must hold at least n+1 entries.

   bs and bs2 are taken from the matrix, but the unrolled arithmetic touches exactly 7 vector
   entries and 49 block entries, so this presumably requires bs == 7 and bs2 == 49.

   Returns 0 on the normal path; a nonzero ierr from the Vec/log calls goes through CHKERRQ.
*/
8068499736aSShri Abhyankar #undef __FUNCT__
8074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
8084dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
8098499736aSShri Abhyankar {
8108499736aSShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
8118499736aSShri Abhyankar   PetscErrorCode  ierr;
812b3260449SShri Abhyankar   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
8138499736aSShri Abhyankar   PetscInt        nz,idx,idt,j,i,oidx;
814b3260449SShri Abhyankar   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
815b3260449SShri Abhyankar   const MatScalar *aa=a->a,*v;
816b3260449SShri Abhyankar   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
8178499736aSShri Abhyankar 
8188499736aSShri Abhyankar   PetscFunctionBegin;
  /* all arithmetic below is done directly on the array of xx, which starts as a copy of bb */
8198499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8208499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8218499736aSShri Abhyankar 
8228499736aSShri Abhyankar   /* forward solve the U^T */
8238499736aSShri Abhyankar   idx = 0;
8248499736aSShri Abhyankar   for (i=0; i<n; i++) {
8258499736aSShri Abhyankar     v = aa + bs2*diag[i];
8268499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8278499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8288499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
    /* s_k = dot product of the k-th run of 7 consecutive block entries with (x1..x7) */
8298499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8308499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8318499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8328499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8338499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8348499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8358499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    /* step back one block: v now addresses block index diag[i]-1 */
8368499736aSShri Abhyankar     v -= bs2;
8378499736aSShri Abhyankar     vi = aj + diag[i] - 1;
8388499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
    /* j runs 0,-1,...,-(nz-1): vi[j] and v are both walked toward lower block indices */
8398499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
8408499736aSShri Abhyankar       oidx       = bs*vi[j];
8418499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8428499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8438499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8448499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8458499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8468499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8478499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8488499736aSShri Abhyankar       v         -= bs2;
8498499736aSShri Abhyankar     }
    /* store the scaled values back into the 7 entries of block row i */
8508499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8518499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8528499736aSShri Abhyankar     idx     += bs;
8538499736aSShri Abhyankar   }
8548499736aSShri Abhyankar   /* backward solve the L^T */
8558499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
    /* the L blocks of row i are visited in increasing block index, ai[i] .. ai[i+1]-1 */
8568499736aSShri Abhyankar     v   = aa + bs2*ai[i];
8578499736aSShri Abhyankar     vi  = aj + ai[i];
8588499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
8598499736aSShri Abhyankar     idt = bs*i;
8608499736aSShri Abhyankar     s1  = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8618499736aSShri Abhyankar     s6  = x[5+idt];  s7 = x[6+idt];
8628499736aSShri Abhyankar     for (j=0; j<nz; j++) {
8638499736aSShri Abhyankar       idx       = bs*vi[j];
8648499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8658499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8668499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8678499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8688499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8698499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8708499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8718499736aSShri Abhyankar       v        += bs2;
8728499736aSShri Abhyankar     }
8738499736aSShri Abhyankar   }
8748499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
8758499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
8768499736aSShri Abhyankar   PetscFunctionReturn(0);
8778499736aSShri Abhyankar }
878f1af5d2fSBarry Smith 
879f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
/*
   MatSolveTranspose_SeqBAIJ_1 - transpose triangular solves for scalar (1x1) blocks, with
   the index sets a->row and a->col applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs, a->row,
                 a->col, a->solve_work) is read; a->solve_work is used as scratch
            bb - right-hand side (read only)
   Output:  xx - receives the result

   Steps, as coded below:
     1. gather:   tmp[i] = b[c[i]], c being the indices of a->col
     2. U^T sweep, i = 0..n-1: s1 = tmp[i]*aa[adiag[i]] (the existing comment calls this entry
        the inverse of the diagonal), then tmp[aj[k]] -= s1*aa[k] for
        k = adiag[i+1]+1 .. adiag[i]-1, and finally tmp[i] = s1
     3. L^T sweep, i = n-1..0: tmp[aj[k]] -= tmp[i]*aa[k] for k = ai[i] .. ai[i+1]-1
     4. scatter:  x[r[i]] = tmp[i], r being the indices of a->row

   adiag[i+1] is read for i = n-1, so adiag must hold at least n+1 entries.

   Returns 0 on the normal path; a nonzero ierr from the PETSc calls goes through CHKERRQ.
*/
8804a2ae208SSatish Balay #undef __FUNCT__
88193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
88293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
88393fd935bSShri Abhyankar {
88493fd935bSShri Abhyankar   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
88593fd935bSShri Abhyankar   IS                iscol = a->col,isrow = a->row;
88693fd935bSShri Abhyankar   PetscErrorCode    ierr;
88793fd935bSShri Abhyankar   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
88893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
88993fd935bSShri Abhyankar   PetscInt          nz;
89093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
89193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
89293fd935bSShri Abhyankar   const PetscScalar *b;
89393fd935bSShri Abhyankar 
89493fd935bSShri Abhyankar   PetscFunctionBegin;
8953649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
89693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* both sweeps operate on the matrix-owned work array, not on x */
89793fd935bSShri Abhyankar   tmp  = a->solve_work;
89893fd935bSShri Abhyankar 
89993fd935bSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
90093fd935bSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
90193fd935bSShri Abhyankar 
90293fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
90393fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[c[i]];
90493fd935bSShri Abhyankar 
90593fd935bSShri Abhyankar   /* forward solve the U^T */
90693fd935bSShri Abhyankar   for (i=0; i<n; i++) {
    /* v/vi start one past adiag[i+1]; with nz entries skipped, v[nz] is aa[adiag[i]] */
90793fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
90893fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
90993fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
91093fd935bSShri Abhyankar     s1  = tmp[i];
91193fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
91293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
91393fd935bSShri Abhyankar     tmp[i] = s1;
91493fd935bSShri Abhyankar   }
91593fd935bSShri Abhyankar 
91693fd935bSShri Abhyankar   /* backward solve the L^T */
91793fd935bSShri Abhyankar   for (i=n-1; i>=0; i--) {
91893fd935bSShri Abhyankar     v  = aa + ai[i];
91993fd935bSShri Abhyankar     vi = aj + ai[i];
92093fd935bSShri Abhyankar     nz = ai[i+1] - ai[i];
92193fd935bSShri Abhyankar     s1 = tmp[i];
92293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
92393fd935bSShri Abhyankar   }
92493fd935bSShri Abhyankar 
92593fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
92693fd935bSShri Abhyankar   for (i=0; i<n; i++) x[r[i]] = tmp[i];
92793fd935bSShri Abhyankar 
92893fd935bSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
92993fd935bSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9303649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
93193fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
93293fd935bSShri Abhyankar 
93393fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
93493fd935bSShri Abhyankar   PetscFunctionReturn(0);
93593fd935bSShri Abhyankar }
93693fd935bSShri Abhyankar 
/*
   MatSolveTranspose_SeqBAIJ_1_inplace - transpose triangular solves for scalar (1x1) blocks,
   with the index sets a->row and a->col applied to the vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs, a->row,
                 a->col, a->solve_work) is read; a->solve_work is used as scratch
            bb - right-hand side (read only)
   Output:  xx - receives the result

   Layout read by this routine: row i occupies entries ai[i] .. ai[i+1]-1 and diag[i] marks
   the diagonal entry inside that range.

   Steps, as coded below:
     1. gather:   t[i] = b[c[i]], c being the indices of a->col
     2. U^T sweep, i = 0..n-1: s1 = aa[diag[i]]*t[i], then t[aj[k]] -= aa[k]*s1 for
        k = diag[i]+1 .. ai[i+1]-1, and finally t[i] = s1
     3. L^T sweep, i = n-1..0: t[aj[k]] -= aa[k]*t[i] for k = diag[i]-1 down to ai[i]
     4. scatter:  x[r[i]] = t[i], r being the indices of a->row

   Returns 0 on the normal path; a nonzero ierr from the PETSc calls goes through CHKERRQ.
*/
93793fd935bSShri Abhyankar #undef __FUNCT__
93806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
93906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940f1af5d2fSBarry Smith {
941f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
942f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9436849ba73SBarry Smith   PetscErrorCode    ierr;
9445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
945b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946b3260449SShri Abhyankar   PetscInt          i,nz;
947b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
948b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
949b3260449SShri Abhyankar   const PetscScalar *b;
950f1af5d2fSBarry Smith 
951f1af5d2fSBarry Smith   PetscFunctionBegin;
9523649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
9531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* both sweeps operate on the matrix-owned work array, not on x */
954f1af5d2fSBarry Smith   t    = a->solve_work;
955f1af5d2fSBarry Smith 
956f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958f1af5d2fSBarry Smith 
959f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
960*26fbe8dcSKarl Rupp   for (i=0; i<n; i++) t[i] = b[c[i]];
961f1af5d2fSBarry Smith 
962f1af5d2fSBarry Smith   /* forward solve the U^T */
963f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
964f1af5d2fSBarry Smith 
965f1af5d2fSBarry Smith     v = aa + diag[i];
966f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* the post-increment leaves v on the first entry after the diagonal */
967f1af5d2fSBarry Smith     s1 = (*v++)*t[i];
968f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
969f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
970f1af5d2fSBarry Smith     while (nz--) {
971f1af5d2fSBarry Smith       t[*vi++] -= (*v++)*s1;
972f1af5d2fSBarry Smith     }
973f1af5d2fSBarry Smith     t[i] = s1;
974f1af5d2fSBarry Smith   }
975f1af5d2fSBarry Smith   /* backward solve the L^T */
976f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the entry just before the diagonal and walk down to ai[i] */
977f1af5d2fSBarry Smith     v  = aa + diag[i] - 1;
978f1af5d2fSBarry Smith     vi = aj + diag[i] - 1;
979f1af5d2fSBarry Smith     nz = diag[i] - ai[i];
980f1af5d2fSBarry Smith     s1 = t[i];
981f1af5d2fSBarry Smith     while (nz--) {
982f1af5d2fSBarry Smith       t[*vi--] -=  (*v--)*s1;
983f1af5d2fSBarry Smith     }
984f1af5d2fSBarry Smith   }
985f1af5d2fSBarry Smith 
986f1af5d2fSBarry Smith   /* copy t into x according to permutation */
987*26fbe8dcSKarl Rupp   for (i=0; i<n; i++) x[r[i]] = t[i];
988f1af5d2fSBarry Smith 
989f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9913649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
9921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
994f1af5d2fSBarry Smith   PetscFunctionReturn(0);
995f1af5d2fSBarry Smith }
996f1af5d2fSBarry Smith 
/*
   MatSolveTranspose_SeqBAIJ_2_inplace - transpose triangular solves with 2x2 blocks
   (4 scalars per block), with the index sets a->row and a->col applied block-wise to the
   vectors.

   Input:   A  - matrix whose Mat_SeqBAIJ data (a->a, a->i, a->j, a->diag, a->mbs, a->row,
                 a->col, a->solve_work) is read; a->solve_work is used as scratch
            bb - right-hand side (read only)
   Output:  xx - receives the result

   Layout read by this routine: block row i occupies block indices ai[i] .. ai[i+1]-1 and
   diag[i] marks the diagonal block inside that range.  Blocks after diag[i] are used by
   the U^T sweep, blocks before it by the L^T sweep.

   The gather copies the pair b[2*c[i]], b[2*c[i]+1] into t[2*i], t[2*i+1]; the scatter copies
   t[2*i], t[2*i+1] into x[2*r[i]], x[2*r[i]+1].

   Returns 0 on the normal path; a nonzero ierr from the PETSc calls goes through CHKERRQ.
*/
9974a2ae208SSatish Balay #undef __FUNCT__
99806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
99906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1000f1af5d2fSBarry Smith {
1001f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1002f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
10036849ba73SBarry Smith   PetscErrorCode    ierr;
10045d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1005b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1006b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1007b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1008b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1009b3260449SShri Abhyankar   const PetscScalar *b;
1010f1af5d2fSBarry Smith 
1011f1af5d2fSBarry Smith   PetscFunctionBegin;
10123649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
10131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* both sweeps operate on the matrix-owned work array, not on x */
1014f1af5d2fSBarry Smith   t    = a->solve_work;
1015f1af5d2fSBarry Smith 
1016f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1017f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1020f1af5d2fSBarry Smith   ii = 0;
1021f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1022f1af5d2fSBarry Smith     ic      = 2*c[i];
1023f1af5d2fSBarry Smith     t[ii]   = b[ic];
1024f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1025f1af5d2fSBarry Smith     ii     += 2;
1026f1af5d2fSBarry Smith   }
1027f1af5d2fSBarry Smith 
1028f1af5d2fSBarry Smith   /* forward solve the U^T */
1029f1af5d2fSBarry Smith   idx = 0;
1030f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1031f1af5d2fSBarry Smith 
1032f1af5d2fSBarry Smith     v = aa + 4*diag[i];
1033f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1034f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx];
    /* s_k = dot product of the k-th pair of consecutive block entries with (x1,x2) */
1035f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
1036f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
    /* step past the diagonal block: v now addresses block index diag[i]+1 */
1037f1af5d2fSBarry Smith     v += 4;
1038f1af5d2fSBarry Smith 
1039f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1040f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
    /* for each block after the diagonal in row i, subtract block*s from the t entries of its block column */
1041f1af5d2fSBarry Smith     while (nz--) {
1042f1af5d2fSBarry Smith       oidx       = 2*(*vi++);
1043f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1044f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1045f1af5d2fSBarry Smith       v         += 4;
1046f1af5d2fSBarry Smith     }
1047f1af5d2fSBarry Smith     t[idx] = s1;t[1+idx] = s2;
1048f1af5d2fSBarry Smith     idx   += 2;
1049f1af5d2fSBarry Smith   }
1050f1af5d2fSBarry Smith   /* backward solve the L^T */
1051f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the block just before the diagonal and walk down to block index ai[i] */
1052f1af5d2fSBarry Smith     v   = aa + 4*diag[i] - 4;
1053f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1054f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1055f1af5d2fSBarry Smith     idt = 2*i;
1056f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt];
1057f1af5d2fSBarry Smith     while (nz--) {
1058f1af5d2fSBarry Smith       idx       = 2*(*vi--);
1059f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1060f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1061f1af5d2fSBarry Smith       v        -= 4;
1062f1af5d2fSBarry Smith     }
1063f1af5d2fSBarry Smith   }
1064f1af5d2fSBarry Smith 
1065f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1066f1af5d2fSBarry Smith   ii = 0;
1067f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1068f1af5d2fSBarry Smith     ir      = 2*r[i];
1069f1af5d2fSBarry Smith     x[ir]   = t[ii];
1070f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1071f1af5d2fSBarry Smith     ii     += 2;
1072f1af5d2fSBarry Smith   }
1073f1af5d2fSBarry Smith 
1074f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1075f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
10771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1078dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1079f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1080f1af5d2fSBarry Smith }
1081f1af5d2fSBarry Smith 
10824a2ae208SSatish Balay #undef __FUNCT__
10834dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
10844dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
108532121132SShri Abhyankar {
108632121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
108732121132SShri Abhyankar   PetscErrorCode    ierr;
108832121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1089b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
109032121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
109132121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1092b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1093b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1094b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1095b3260449SShri Abhyankar   const PetscScalar *b;
109632121132SShri Abhyankar 
109732121132SShri Abhyankar   PetscFunctionBegin;
10983649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
109932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
110032121132SShri Abhyankar   t    = a->solve_work;
110132121132SShri Abhyankar 
110232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
110332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
110432121132SShri Abhyankar 
110532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
110632121132SShri Abhyankar   for (i=0; i<n; i++) {
110732121132SShri Abhyankar     ii    = bs*i; ic = bs*c[i];
110832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
110932121132SShri Abhyankar   }
111032121132SShri Abhyankar 
111132121132SShri Abhyankar   /* forward solve the U^T */
111232121132SShri Abhyankar   idx = 0;
111332121132SShri Abhyankar   for (i=0; i<n; i++) {
111432121132SShri Abhyankar     v = aa + bs2*diag[i];
111532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
111632121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
111732121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
111832121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
111932121132SShri Abhyankar     v -= bs2;
112032121132SShri Abhyankar 
112132121132SShri Abhyankar     vi = aj + diag[i] - 1;
112232121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
112332121132SShri Abhyankar     for (j=0; j>-nz; j--) {
112432121132SShri Abhyankar       oidx       = bs*vi[j];
112532121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
112632121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
112732121132SShri Abhyankar       v         -= bs2;
112832121132SShri Abhyankar     }
112932121132SShri Abhyankar     t[idx] = s1;t[1+idx] = s2;
113032121132SShri Abhyankar     idx   += bs;
113132121132SShri Abhyankar   }
113232121132SShri Abhyankar   /* backward solve the L^T */
113332121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
113432121132SShri Abhyankar     v   = aa + bs2*ai[i];
113532121132SShri Abhyankar     vi  = aj + ai[i];
113632121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
113732121132SShri Abhyankar     idt = bs*i;
113832121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];
113932121132SShri Abhyankar     for (j=0; j<nz; j++) {
114032121132SShri Abhyankar       idx       = bs*vi[j];
114132121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
114232121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
114332121132SShri Abhyankar       v        += bs2;
114432121132SShri Abhyankar     }
114532121132SShri Abhyankar   }
114632121132SShri Abhyankar 
114732121132SShri Abhyankar   /* copy t into x according to permutation */
114832121132SShri Abhyankar   for (i=0; i<n; i++) {
114932121132SShri Abhyankar     ii    = bs*i;  ir = bs*r[i];
115032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
115132121132SShri Abhyankar   }
115232121132SShri Abhyankar 
115332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
115432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
115632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
115732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
115832121132SShri Abhyankar   PetscFunctionReturn(0);
115932121132SShri Abhyankar }
116032121132SShri Abhyankar 
116132121132SShri Abhyankar #undef __FUNCT__
116206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
116306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1164f1af5d2fSBarry Smith {
1165f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1166f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
11676849ba73SBarry Smith   PetscErrorCode    ierr;
11685d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1169b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1170b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1171b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1172b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1173b3260449SShri Abhyankar   const PetscScalar *b;
1174f1af5d2fSBarry Smith 
1175f1af5d2fSBarry Smith   PetscFunctionBegin;
11763649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
11771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1178f1af5d2fSBarry Smith   t    = a->solve_work;
1179f1af5d2fSBarry Smith 
1180f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1181f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1182f1af5d2fSBarry Smith 
1183f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1184f1af5d2fSBarry Smith   ii = 0;
1185f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1186f1af5d2fSBarry Smith     ic      = 3*c[i];
1187f1af5d2fSBarry Smith     t[ii]   = b[ic];
1188f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1189f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1190f1af5d2fSBarry Smith     ii     += 3;
1191f1af5d2fSBarry Smith   }
1192f1af5d2fSBarry Smith 
1193f1af5d2fSBarry Smith   /* forward solve the U^T */
1194f1af5d2fSBarry Smith   idx = 0;
1195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1196f1af5d2fSBarry Smith 
1197f1af5d2fSBarry Smith     v = aa + 9*diag[i];
1198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1199f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1201f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1202f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1203f1af5d2fSBarry Smith     v += 9;
1204f1af5d2fSBarry Smith 
1205f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1206f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
1207f1af5d2fSBarry Smith     while (nz--) {
1208f1af5d2fSBarry Smith       oidx       = 3*(*vi++);
1209f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1210f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1211f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1212f1af5d2fSBarry Smith       v         += 9;
1213f1af5d2fSBarry Smith     }
1214f1af5d2fSBarry Smith     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1215f1af5d2fSBarry Smith     idx   += 3;
1216f1af5d2fSBarry Smith   }
1217f1af5d2fSBarry Smith   /* backward solve the L^T */
1218f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1219f1af5d2fSBarry Smith     v   = aa + 9*diag[i] - 9;
1220f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1221f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1222f1af5d2fSBarry Smith     idt = 3*i;
1223f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1224f1af5d2fSBarry Smith     while (nz--) {
1225f1af5d2fSBarry Smith       idx       = 3*(*vi--);
1226f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1227f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1228f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1229f1af5d2fSBarry Smith       v        -= 9;
1230f1af5d2fSBarry Smith     }
1231f1af5d2fSBarry Smith   }
1232f1af5d2fSBarry Smith 
1233f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1234f1af5d2fSBarry Smith   ii = 0;
1235f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1236f1af5d2fSBarry Smith     ir      = 3*r[i];
1237f1af5d2fSBarry Smith     x[ir]   = t[ii];
1238f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1239f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1240f1af5d2fSBarry Smith     ii     += 3;
1241f1af5d2fSBarry Smith   }
1242f1af5d2fSBarry Smith 
1243f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1244f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12453649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
12461ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1247dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1248f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1249f1af5d2fSBarry Smith }
1250f1af5d2fSBarry Smith 
12514a2ae208SSatish Balay #undef __FUNCT__
12524dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
12534dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
125432121132SShri Abhyankar {
125532121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
125632121132SShri Abhyankar   PetscErrorCode    ierr;
125732121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1258b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
125932121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
126032121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1261b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1262b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1263b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1264b3260449SShri Abhyankar   const PetscScalar *b;
126532121132SShri Abhyankar 
126632121132SShri Abhyankar   PetscFunctionBegin;
12673649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
126832121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126932121132SShri Abhyankar   t    = a->solve_work;
127032121132SShri Abhyankar 
127132121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
127232121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
127332121132SShri Abhyankar 
127432121132SShri Abhyankar   /* copy b into temp work space according to permutation */
127532121132SShri Abhyankar   for (i=0; i<n; i++) {
127632121132SShri Abhyankar     ii    = bs*i; ic = bs*c[i];
127732121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
127832121132SShri Abhyankar   }
127932121132SShri Abhyankar 
128032121132SShri Abhyankar   /* forward solve the U^T */
128132121132SShri Abhyankar   idx = 0;
128232121132SShri Abhyankar   for (i=0; i<n; i++) {
128332121132SShri Abhyankar     v = aa + bs2*diag[i];
128432121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
128532121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
128632121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
128732121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
128832121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
128932121132SShri Abhyankar     v -= bs2;
129032121132SShri Abhyankar 
129132121132SShri Abhyankar     vi = aj + diag[i] - 1;
129232121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
129332121132SShri Abhyankar     for (j=0; j>-nz; j--) {
129432121132SShri Abhyankar       oidx       = bs*vi[j];
129532121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
129632121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
129732121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
129832121132SShri Abhyankar       v         -= bs2;
129932121132SShri Abhyankar     }
130032121132SShri Abhyankar     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;
130132121132SShri Abhyankar     idx   += bs;
130232121132SShri Abhyankar   }
130332121132SShri Abhyankar   /* backward solve the L^T */
130432121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
130532121132SShri Abhyankar     v   = aa + bs2*ai[i];
130632121132SShri Abhyankar     vi  = aj + ai[i];
130732121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
130832121132SShri Abhyankar     idt = bs*i;
130932121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
131032121132SShri Abhyankar     for (j=0; j<nz; j++) {
131132121132SShri Abhyankar       idx       = bs*vi[j];
131232121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
131332121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
131432121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
131532121132SShri Abhyankar       v        += bs2;
131632121132SShri Abhyankar     }
131732121132SShri Abhyankar   }
131832121132SShri Abhyankar 
131932121132SShri Abhyankar   /* copy t into x according to permutation */
132032121132SShri Abhyankar   for (i=0; i<n; i++) {
132132121132SShri Abhyankar     ii    = bs*i;  ir = bs*r[i];
132232121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
132332121132SShri Abhyankar   }
132432121132SShri Abhyankar 
132532121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
132632121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13273649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
132832121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
132932121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
133032121132SShri Abhyankar   PetscFunctionReturn(0);
133132121132SShri Abhyankar }
133232121132SShri Abhyankar 
133332121132SShri Abhyankar #undef __FUNCT__
133406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
133506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1336f1af5d2fSBarry Smith {
1337f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1338f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
13396849ba73SBarry Smith   PetscErrorCode    ierr;
13405d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1341b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1342b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1343b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1344b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1345b3260449SShri Abhyankar   const PetscScalar *b;
1346f1af5d2fSBarry Smith 
1347f1af5d2fSBarry Smith   PetscFunctionBegin;
13483649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
13491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1350f1af5d2fSBarry Smith   t    = a->solve_work;
1351f1af5d2fSBarry Smith 
1352f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1353f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1354f1af5d2fSBarry Smith 
1355f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1356f1af5d2fSBarry Smith   ii = 0;
1357f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1358f1af5d2fSBarry Smith     ic      = 4*c[i];
1359f1af5d2fSBarry Smith     t[ii]   = b[ic];
1360f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1361f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1362f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1363f1af5d2fSBarry Smith     ii     += 4;
1364f1af5d2fSBarry Smith   }
1365f1af5d2fSBarry Smith 
1366f1af5d2fSBarry Smith   /* forward solve the U^T */
1367f1af5d2fSBarry Smith   idx = 0;
1368f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1369f1af5d2fSBarry Smith 
1370f1af5d2fSBarry Smith     v = aa + 16*diag[i];
1371f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1372f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1373f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1374f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1375f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1376f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1377f1af5d2fSBarry Smith     v += 16;
1378f1af5d2fSBarry Smith 
1379f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1380f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
1381f1af5d2fSBarry Smith     while (nz--) {
1382f1af5d2fSBarry Smith       oidx       = 4*(*vi++);
1383f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1384f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1385f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1386f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1387f1af5d2fSBarry Smith       v         += 16;
1388f1af5d2fSBarry Smith     }
1389f1af5d2fSBarry Smith     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1390f1af5d2fSBarry Smith     idx   += 4;
1391f1af5d2fSBarry Smith   }
1392f1af5d2fSBarry Smith   /* backward solve the L^T */
1393f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1394f1af5d2fSBarry Smith     v   = aa + 16*diag[i] - 16;
1395f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1396f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1397f1af5d2fSBarry Smith     idt = 4*i;
1398f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1399f1af5d2fSBarry Smith     while (nz--) {
1400f1af5d2fSBarry Smith       idx       = 4*(*vi--);
1401f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1402f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1403f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1404f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1405f1af5d2fSBarry Smith       v        -= 16;
1406f1af5d2fSBarry Smith     }
1407f1af5d2fSBarry Smith   }
1408f1af5d2fSBarry Smith 
1409f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1410f1af5d2fSBarry Smith   ii = 0;
1411f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1412f1af5d2fSBarry Smith     ir      = 4*r[i];
1413f1af5d2fSBarry Smith     x[ir]   = t[ii];
1414f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1415f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1416f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1417f1af5d2fSBarry Smith     ii     += 4;
1418f1af5d2fSBarry Smith   }
1419f1af5d2fSBarry Smith 
1420f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1421f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14223649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
14231ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1424dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1425f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1426f1af5d2fSBarry Smith }
1427f1af5d2fSBarry Smith 
14284a2ae208SSatish Balay #undef __FUNCT__
14294dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
14304dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
143132121132SShri Abhyankar {
143232121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
143332121132SShri Abhyankar   PetscErrorCode    ierr;
143432121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1435b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
143632121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
143732121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1438b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1439b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1440b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1441b3260449SShri Abhyankar   const PetscScalar *b;
144232121132SShri Abhyankar 
144332121132SShri Abhyankar   PetscFunctionBegin;
14443649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
144532121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
144632121132SShri Abhyankar   t    = a->solve_work;
144732121132SShri Abhyankar 
144832121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
144932121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
145032121132SShri Abhyankar 
145132121132SShri Abhyankar   /* copy b into temp work space according to permutation */
145232121132SShri Abhyankar   for (i=0; i<n; i++) {
145332121132SShri Abhyankar     ii    = bs*i; ic = bs*c[i];
145432121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
145532121132SShri Abhyankar   }
145632121132SShri Abhyankar 
145732121132SShri Abhyankar   /* forward solve the U^T */
145832121132SShri Abhyankar   idx = 0;
145932121132SShri Abhyankar   for (i=0; i<n; i++) {
146032121132SShri Abhyankar     v = aa + bs2*diag[i];
146132121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
146232121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
146332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
146432121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
146532121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
146632121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
146732121132SShri Abhyankar     v -= bs2;
146832121132SShri Abhyankar 
146932121132SShri Abhyankar     vi = aj + diag[i] - 1;
147032121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
147132121132SShri Abhyankar     for (j=0; j>-nz; j--) {
147232121132SShri Abhyankar       oidx       = bs*vi[j];
147332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
147432121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
147532121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
147632121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
147732121132SShri Abhyankar       v         -= bs2;
147832121132SShri Abhyankar     }
147932121132SShri Abhyankar     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
148032121132SShri Abhyankar     idx   += bs;
148132121132SShri Abhyankar   }
148232121132SShri Abhyankar   /* backward solve the L^T */
148332121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
148432121132SShri Abhyankar     v   = aa + bs2*ai[i];
148532121132SShri Abhyankar     vi  = aj + ai[i];
148632121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
148732121132SShri Abhyankar     idt = bs*i;
148832121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
148932121132SShri Abhyankar     for (j=0; j<nz; j++) {
149032121132SShri Abhyankar       idx       = bs*vi[j];
149132121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
149232121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
149332121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
149432121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
149532121132SShri Abhyankar       v        += bs2;
149632121132SShri Abhyankar     }
149732121132SShri Abhyankar   }
149832121132SShri Abhyankar 
149932121132SShri Abhyankar   /* copy t into x according to permutation */
150032121132SShri Abhyankar   for (i=0; i<n; i++) {
150132121132SShri Abhyankar     ii    = bs*i;  ir = bs*r[i];
150232121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
150332121132SShri Abhyankar   }
150432121132SShri Abhyankar 
150532121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
150632121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15073649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
150832121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
150932121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
151032121132SShri Abhyankar   PetscFunctionReturn(0);
151132121132SShri Abhyankar }
151232121132SShri Abhyankar 
151332121132SShri Abhyankar #undef __FUNCT__
151406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
151506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1516f1af5d2fSBarry Smith {
1517f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1518f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
15196849ba73SBarry Smith   PetscErrorCode    ierr;
15205d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1521b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1522b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1523b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1524b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1525b3260449SShri Abhyankar   const PetscScalar *b;
1526f1af5d2fSBarry Smith 
1527f1af5d2fSBarry Smith   PetscFunctionBegin;
15283649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
15291ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1530f1af5d2fSBarry Smith   t    = a->solve_work;
1531f1af5d2fSBarry Smith 
1532f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1533f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1534f1af5d2fSBarry Smith 
1535f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1536f1af5d2fSBarry Smith   ii = 0;
1537f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1538f1af5d2fSBarry Smith     ic      = 5*c[i];
1539f1af5d2fSBarry Smith     t[ii]   = b[ic];
1540f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1541f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1542f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1543f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1544f1af5d2fSBarry Smith     ii     += 5;
1545f1af5d2fSBarry Smith   }
1546f1af5d2fSBarry Smith 
1547f1af5d2fSBarry Smith   /* forward solve the U^T */
1548f1af5d2fSBarry Smith   idx = 0;
1549f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1550f1af5d2fSBarry Smith 
1551f1af5d2fSBarry Smith     v = aa + 25*diag[i];
1552f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1553f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1554f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1555f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1556f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1557f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1558f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1559f1af5d2fSBarry Smith     v += 25;
1560f1af5d2fSBarry Smith 
1561f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1562f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
1563f1af5d2fSBarry Smith     while (nz--) {
1564f1af5d2fSBarry Smith       oidx       = 5*(*vi++);
1565f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1566f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1567f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1568f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1569f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1570f1af5d2fSBarry Smith       v         += 25;
1571f1af5d2fSBarry Smith     }
1572f1af5d2fSBarry Smith     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1573f1af5d2fSBarry Smith     idx   += 5;
1574f1af5d2fSBarry Smith   }
1575f1af5d2fSBarry Smith   /* backward solve the L^T */
1576f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1577f1af5d2fSBarry Smith     v   = aa + 25*diag[i] - 25;
1578f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1579f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1580f1af5d2fSBarry Smith     idt = 5*i;
1581f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1582f1af5d2fSBarry Smith     while (nz--) {
1583f1af5d2fSBarry Smith       idx       = 5*(*vi--);
1584f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1585f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1586f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1587f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1588f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1589f1af5d2fSBarry Smith       v        -= 25;
1590f1af5d2fSBarry Smith     }
1591f1af5d2fSBarry Smith   }
1592f1af5d2fSBarry Smith 
1593f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1594f1af5d2fSBarry Smith   ii = 0;
1595f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1596f1af5d2fSBarry Smith     ir      = 5*r[i];
1597f1af5d2fSBarry Smith     x[ir]   = t[ii];
1598f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1599f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1600f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1601f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1602f1af5d2fSBarry Smith     ii     += 5;
1603f1af5d2fSBarry Smith   }
1604f1af5d2fSBarry Smith 
1605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1606f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16073649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1609dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1610f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1611f1af5d2fSBarry Smith }
1612f1af5d2fSBarry Smith 
/*
   MatSolveTranspose_SeqBAIJ_5 - Transpose solve using the L and U factor blocks
   held in a Mat_SeqBAIJ, with the per-block arithmetic unrolled for 5 values.

   A  - matrix; A->data supplies the block values (a->a), the index arrays
        (a->i, a->j, a->diag), the index sets a->row and a->col, and the
        work array a->solve_work
   bb - right-hand side, accessed read-only through VecGetArrayRead()
   xx - result, accessed through VecGetArray()

   Phases:
     1. block c[i] of bb is copied to block i of the work array t (c from a->col)
     2. "forward solve the U^T": for i = 0..n-1, block i of t is multiplied by
        the block stored at diag[i], and the product is used to update the
        blocks of t selected by the a->j entries stored just before diag[i]
     3. "backward solve the L^T": for i = n-1..0, block i of t is used to
        update the blocks of t selected by the a->j entries ai[i]..ai[i+1]-1
     4. block i of t is copied to block r[i] of xx (r from a->row)

   In every block product, values v[5*k]..v[5*k+4] are combined with the five
   source values to produce entry k of the result.

   NOTE(review): offsets are scaled by bs = A->rmap->bs and bs2 = a->bs2 but
   exactly five values per block are handled; presumably bs == 5 and
   bs2 == 25 here - confirm against the code that selects this routine.

   Return codes of the PETSc calls are checked with CHKERRQ; the routine ends
   with PetscFunctionReturn(0).
*/
16134a2ae208SSatish Balay #undef __FUNCT__
16144dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
16154dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
161632121132SShri Abhyankar {
161732121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
161832121132SShri Abhyankar   PetscErrorCode    ierr;
161932121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1620b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
162132121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
162232121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1623b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1624b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1625b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1626b3260449SShri Abhyankar   const PetscScalar *b;
162732121132SShri Abhyankar 
162832121132SShri Abhyankar   PetscFunctionBegin;
16293649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
163032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* all intermediate results live in the matrix's own work array */
163132121132SShri Abhyankar   t    = a->solve_work;
163232121132SShri Abhyankar 
  /* rout/cout keep the pointers handed back to ISRestoreIndices() below */
163332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
163432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
163532121132SShri Abhyankar 
163632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
163732121132SShri Abhyankar   for (i=0; i<n; i++) {
163832121132SShri Abhyankar     ii      = bs*i; ic = bs*c[i];
163932121132SShri Abhyankar     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
164032121132SShri Abhyankar     t[ii+4] = b[ic+4];
164132121132SShri Abhyankar   }
164232121132SShri Abhyankar 
164332121132SShri Abhyankar   /* forward solve the U^T */
  /* idx tracks the offset of block i inside t and advances by bs per iteration */
164432121132SShri Abhyankar   idx = 0;
164532121132SShri Abhyankar   for (i=0; i<n; i++) {
164632121132SShri Abhyankar     v = aa + bs2*diag[i];
164732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
164832121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
164932121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
165032121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
165132121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
165232121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
165332121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
    /* step to the block stored immediately before the diagonal block */
165432121132SShri Abhyankar     v -= bs2;
165532121132SShri Abhyankar 
    /* the nz blocks used below sit just before diag[i] in a->a and a->j and are
       visited in decreasing storage order (j = 0,-1,...); diag[i+1] is read for
       i = n-1 as well, so a->diag must provide n+1 entries */
165632121132SShri Abhyankar     vi = aj + diag[i] - 1;
165732121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
165832121132SShri Abhyankar     for (j=0; j>-nz; j--) {
165932121132SShri Abhyankar       oidx       = bs*vi[j];
166032121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
166132121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
166232121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
166332121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
166432121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
166532121132SShri Abhyankar       v         -= bs2;
166632121132SShri Abhyankar     }
    /* block i of t now holds the product computed with the diag[i] block */
166732121132SShri Abhyankar     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
166832121132SShri Abhyankar     idx   += bs;
166932121132SShri Abhyankar   }
167032121132SShri Abhyankar   /* backward solve the L^T */
167132121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
    /* blocks at storage positions ai[i] .. ai[i+1]-1 are visited in increasing order */
167232121132SShri Abhyankar     v   = aa + bs2*ai[i];
167332121132SShri Abhyankar     vi  = aj + ai[i];
167432121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
167532121132SShri Abhyankar     idt = bs*i;
167632121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
167732121132SShri Abhyankar     for (j=0; j<nz; j++) {
167832121132SShri Abhyankar       idx       = bs*vi[j];
167932121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
168032121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
168132121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
168232121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
168332121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
168432121132SShri Abhyankar       v        += bs2;
168532121132SShri Abhyankar     }
168632121132SShri Abhyankar   }
168732121132SShri Abhyankar 
168832121132SShri Abhyankar   /* copy t into x according to permutation */
168932121132SShri Abhyankar   for (i=0; i<n; i++) {
169032121132SShri Abhyankar     ii      = bs*i;  ir = bs*r[i];
169132121132SShri Abhyankar     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
169232121132SShri Abhyankar     x[ir+4] = t[ii+4];
169332121132SShri Abhyankar   }
169432121132SShri Abhyankar 
169532121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
169632121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16973649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
169832121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* report the operation count for this solve to PetscLogFlops() */
169932121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
170032121132SShri Abhyankar   PetscFunctionReturn(0);
170132121132SShri Abhyankar }
170232121132SShri Abhyankar 
/*
   MatSolveTranspose_SeqBAIJ_6_inplace - Transpose solve using the L and U
   factor blocks held in a Mat_SeqBAIJ, with every block hard-coded as 6
   values per side (36 values per block).

   A  - matrix; A->data supplies the block values (a->a), the index arrays
        (a->i, a->j, a->diag), the index sets a->row and a->col, and the
        work array a->solve_work
   bb - right-hand side, accessed read-only through VecGetArrayRead()
   xx - result, accessed through VecGetArray()

   Phases:
     1. block c[i] of bb is copied to block i of the work array t (c from a->col)
     2. "forward solve the U^T": for i = 0..n-1, block i of t is multiplied by
        the block stored at diag[i], and the product is used to update the
        blocks of t selected by the a->j entries diag[i]+1 .. ai[i+1]-1
     3. "backward solve the L^T": for i = n-1..0, block i of t is used to
        update the blocks of t selected by the a->j entries diag[i]-1 down
        to ai[i]
     4. block i of t is copied to block r[i] of xx (r from a->row)

   In every block product, values v[6*k]..v[6*k+5] are combined with the six
   source values to produce entry k of the result.

   Return codes of the PETSc calls are checked with CHKERRQ; the routine ends
   with PetscFunctionReturn(0).
*/
170332121132SShri Abhyankar #undef __FUNCT__
170406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
170506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1706f1af5d2fSBarry Smith {
1707f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1708f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
17096849ba73SBarry Smith   PetscErrorCode    ierr;
17105d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1711b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1712b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1713b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1714b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1715b3260449SShri Abhyankar   const PetscScalar *b;
1716f1af5d2fSBarry Smith 
1717f1af5d2fSBarry Smith   PetscFunctionBegin;
17183649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
17191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* all intermediate results live in the matrix's own work array */
1720f1af5d2fSBarry Smith   t    = a->solve_work;
1721f1af5d2fSBarry Smith 
  /* rout/cout keep the pointers handed back to ISRestoreIndices() below */
1722f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1723f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1724f1af5d2fSBarry Smith 
1725f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
  /* ii is the running offset of block i in t; ic is the offset of block c[i] in b */
1726f1af5d2fSBarry Smith   ii = 0;
1727f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1728f1af5d2fSBarry Smith     ic      = 6*c[i];
1729f1af5d2fSBarry Smith     t[ii]   = b[ic];
1730f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1731f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1732f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1733f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1734f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1735f1af5d2fSBarry Smith     ii     += 6;
1736f1af5d2fSBarry Smith   }
1737f1af5d2fSBarry Smith 
1738f1af5d2fSBarry Smith   /* forward solve the U^T */
  /* idx tracks the offset of block i inside t and advances by 6 per iteration */
1739f1af5d2fSBarry Smith   idx = 0;
1740f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1741f1af5d2fSBarry Smith 
1742f1af5d2fSBarry Smith     v = aa + 36*diag[i];
1743f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1744f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1745f1af5d2fSBarry Smith     x6 = t[5+idx];
1746f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1747f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1748f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1749f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1750f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1751f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    /* step to the block stored immediately after the diagonal block */
1752f1af5d2fSBarry Smith     v += 36;
1753f1af5d2fSBarry Smith 
    /* the remaining blocks of row i, positions diag[i]+1 .. ai[i+1]-1, are
       visited in increasing storage order */
1754f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1755f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
1756f1af5d2fSBarry Smith     while (nz--) {
1757f1af5d2fSBarry Smith       oidx       = 6*(*vi++);
1758f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1759f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1760f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1761f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1762f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1763f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1764f1af5d2fSBarry Smith       v         += 36;
1765f1af5d2fSBarry Smith     }
    /* block i of t now holds the product computed with the diag[i] block */
1766f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1767f1af5d2fSBarry Smith     t[5+idx] = s6;
1768f1af5d2fSBarry Smith     idx     += 6;
1769f1af5d2fSBarry Smith   }
1770f1af5d2fSBarry Smith   /* backward solve the L^T */
1771f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the block just before the diagonal and walk back to position ai[i] */
1772f1af5d2fSBarry Smith     v   = aa + 36*diag[i] - 36;
1773f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1774f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1775f1af5d2fSBarry Smith     idt = 6*i;
1776f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1777f1af5d2fSBarry Smith     s6  = t[5+idt];
1778f1af5d2fSBarry Smith     while (nz--) {
1779f1af5d2fSBarry Smith       idx       = 6*(*vi--);
1780f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1781f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1782f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1783f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1784f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1785f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1786f1af5d2fSBarry Smith       v        -= 36;
1787f1af5d2fSBarry Smith     }
1788f1af5d2fSBarry Smith   }
1789f1af5d2fSBarry Smith 
1790f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1791f1af5d2fSBarry Smith   ii = 0;
1792f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1793f1af5d2fSBarry Smith     ir      = 6*r[i];
1794f1af5d2fSBarry Smith     x[ir]   = t[ii];
1795f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1796f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1797f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1798f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1799f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1800f1af5d2fSBarry Smith     ii     += 6;
1801f1af5d2fSBarry Smith   }
1802f1af5d2fSBarry Smith 
1803f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1804f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18053649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
18061ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* report the operation count for this solve to PetscLogFlops() */
1807dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1808f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1809f1af5d2fSBarry Smith }
1810f1af5d2fSBarry Smith 
/*
   MatSolveTranspose_SeqBAIJ_6 - Transpose solve using the L and U factor blocks
   held in a Mat_SeqBAIJ, with the per-block arithmetic unrolled for 6 values.

   A  - matrix; A->data supplies the block values (a->a), the index arrays
        (a->i, a->j, a->diag), the index sets a->row and a->col, and the
        work array a->solve_work
   bb - right-hand side, accessed read-only through VecGetArrayRead()
   xx - result, accessed through VecGetArray()

   Phases:
     1. block c[i] of bb is copied to block i of the work array t (c from a->col)
     2. "forward solve the U^T": for i = 0..n-1, block i of t is multiplied by
        the block stored at diag[i], and the product is used to update the
        blocks of t selected by the a->j entries stored just before diag[i]
     3. "backward solve the L^T": for i = n-1..0, block i of t is used to
        update the blocks of t selected by the a->j entries ai[i]..ai[i+1]-1
     4. block i of t is copied to block r[i] of xx (r from a->row)

   In every block product, values v[6*k]..v[6*k+5] are combined with the six
   source values to produce entry k of the result.

   NOTE(review): offsets are scaled by bs = A->rmap->bs and bs2 = a->bs2 but
   exactly six values per block are handled; presumably bs == 6 and
   bs2 == 36 here - confirm against the code that selects this routine.

   Return codes of the PETSc calls are checked with CHKERRQ; the routine ends
   with PetscFunctionReturn(0).
*/
18114a2ae208SSatish Balay #undef __FUNCT__
18124dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
18134dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
181432121132SShri Abhyankar {
181532121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
181632121132SShri Abhyankar   PetscErrorCode    ierr;
181732121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1818b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
181932121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
182032121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1821b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1822b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1823b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1824b3260449SShri Abhyankar   const PetscScalar *b;
182532121132SShri Abhyankar 
182632121132SShri Abhyankar   PetscFunctionBegin;
18273649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
182832121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* all intermediate results live in the matrix's own work array */
182932121132SShri Abhyankar   t    = a->solve_work;
183032121132SShri Abhyankar 
  /* rout/cout keep the pointers handed back to ISRestoreIndices() below */
183132121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
183232121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
183332121132SShri Abhyankar 
183432121132SShri Abhyankar   /* copy b into temp work space according to permutation */
183532121132SShri Abhyankar   for (i=0; i<n; i++) {
183632121132SShri Abhyankar     ii      = bs*i; ic = bs*c[i];
183732121132SShri Abhyankar     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
183832121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
183932121132SShri Abhyankar   }
184032121132SShri Abhyankar 
184132121132SShri Abhyankar   /* forward solve the U^T */
  /* idx tracks the offset of block i inside t and advances by bs per iteration */
184232121132SShri Abhyankar   idx = 0;
184332121132SShri Abhyankar   for (i=0; i<n; i++) {
184432121132SShri Abhyankar     v = aa + bs2*diag[i];
184532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
184632121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
184732121132SShri Abhyankar     x6 = t[5+idx];
184832121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
184932121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
185032121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
185132121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
185232121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
185332121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    /* step to the block stored immediately before the diagonal block */
185432121132SShri Abhyankar     v -= bs2;
185532121132SShri Abhyankar 
    /* the nz blocks used below sit just before diag[i] in a->a and a->j and are
       visited in decreasing storage order (j = 0,-1,...); diag[i+1] is read for
       i = n-1 as well, so a->diag must provide n+1 entries */
185632121132SShri Abhyankar     vi = aj + diag[i] - 1;
185732121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
185832121132SShri Abhyankar     for (j=0; j>-nz; j--) {
185932121132SShri Abhyankar       oidx       = bs*vi[j];
186032121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
186132121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
186232121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
186332121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
186432121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
186532121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
186632121132SShri Abhyankar       v         -= bs2;
186732121132SShri Abhyankar     }
    /* block i of t now holds the product computed with the diag[i] block */
186832121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
186932121132SShri Abhyankar     t[5+idx] = s6;
187032121132SShri Abhyankar     idx     += bs;
187132121132SShri Abhyankar   }
187232121132SShri Abhyankar   /* backward solve the L^T */
187332121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
    /* blocks at storage positions ai[i] .. ai[i+1]-1 are visited in increasing order */
187432121132SShri Abhyankar     v   = aa + bs2*ai[i];
187532121132SShri Abhyankar     vi  = aj + ai[i];
187632121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
187732121132SShri Abhyankar     idt = bs*i;
187832121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
187932121132SShri Abhyankar     s6  = t[5+idt];
188032121132SShri Abhyankar     for (j=0; j<nz; j++) {
188132121132SShri Abhyankar       idx       = bs*vi[j];
188232121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
188332121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
188432121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
188532121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
188632121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
188732121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
188832121132SShri Abhyankar       v        += bs2;
188932121132SShri Abhyankar     }
189032121132SShri Abhyankar   }
189132121132SShri Abhyankar 
189232121132SShri Abhyankar   /* copy t into x according to permutation */
189332121132SShri Abhyankar   for (i=0; i<n; i++) {
189432121132SShri Abhyankar     ii      = bs*i;  ir = bs*r[i];
189532121132SShri Abhyankar     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
189632121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
189732121132SShri Abhyankar   }
189832121132SShri Abhyankar 
189932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
190032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19013649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
190232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* report the operation count for this solve to PetscLogFlops() */
190332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
190432121132SShri Abhyankar   PetscFunctionReturn(0);
190532121132SShri Abhyankar }
190632121132SShri Abhyankar 
/*
   MatSolveTranspose_SeqBAIJ_7_inplace - Transpose solve using the L and U
   factor blocks held in a Mat_SeqBAIJ, with every block hard-coded as 7
   values per side (49 values per block).

   A  - matrix; A->data supplies the block values (a->a), the index arrays
        (a->i, a->j, a->diag), the index sets a->row and a->col, and the
        work array a->solve_work
   bb - right-hand side, accessed read-only through VecGetArrayRead()
   xx - result, accessed through VecGetArray()

   Phases:
     1. block c[i] of bb is copied to block i of the work array t (c from a->col)
     2. "forward solve the U^T": for i = 0..n-1, block i of t is multiplied by
        the block stored at diag[i], and the product is used to update the
        blocks of t selected by the a->j entries diag[i]+1 .. ai[i+1]-1
     3. "backward solve the L^T": for i = n-1..0, block i of t is used to
        update the blocks of t selected by the a->j entries diag[i]-1 down
        to ai[i]
     4. block i of t is copied to block r[i] of xx (r from a->row)

   In every block product, values v[7*k]..v[7*k+6] are combined with the seven
   source values to produce entry k of the result.

   Return codes of the PETSc calls are checked with CHKERRQ; the routine ends
   with PetscFunctionReturn(0).
*/
190732121132SShri Abhyankar #undef __FUNCT__
190806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
190906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1910f1af5d2fSBarry Smith {
1911f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1912f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
19136849ba73SBarry Smith   PetscErrorCode    ierr;
19145d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1915b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1916b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1917b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1918b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1919b3260449SShri Abhyankar   const PetscScalar *b;
1920f1af5d2fSBarry Smith 
1921f1af5d2fSBarry Smith   PetscFunctionBegin;
19223649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* all intermediate results live in the matrix's own work array */
1924f1af5d2fSBarry Smith   t    = a->solve_work;
1925f1af5d2fSBarry Smith 
  /* rout/cout keep the pointers handed back to ISRestoreIndices() below */
1926f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1927f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1928f1af5d2fSBarry Smith 
1929f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
  /* ii is the running offset of block i in t; ic is the offset of block c[i] in b */
1930f1af5d2fSBarry Smith   ii = 0;
1931f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1932f1af5d2fSBarry Smith     ic      = 7*c[i];
1933f1af5d2fSBarry Smith     t[ii]   = b[ic];
1934f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1935f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1936f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1937f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1938f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1939f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1940f1af5d2fSBarry Smith     ii     += 7;
1941f1af5d2fSBarry Smith   }
1942f1af5d2fSBarry Smith 
1943f1af5d2fSBarry Smith   /* forward solve the U^T */
  /* idx tracks the offset of block i inside t and advances by 7 per iteration */
1944f1af5d2fSBarry Smith   idx = 0;
1945f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1946f1af5d2fSBarry Smith 
1947f1af5d2fSBarry Smith     v = aa + 49*diag[i];
1948f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1949f1af5d2fSBarry Smith     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1950f1af5d2fSBarry Smith     x6 = t[5+idx]; x7 = t[6+idx];
1951f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1952f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1953f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1954f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1955f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1956f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1957f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    /* step to the block stored immediately after the diagonal block */
1958f1af5d2fSBarry Smith     v += 49;
1959f1af5d2fSBarry Smith 
    /* the remaining blocks of row i, positions diag[i]+1 .. ai[i+1]-1, are
       visited in increasing storage order */
1960f1af5d2fSBarry Smith     vi = aj + diag[i] + 1;
1961f1af5d2fSBarry Smith     nz = ai[i+1] - diag[i] - 1;
1962f1af5d2fSBarry Smith     while (nz--) {
1963f1af5d2fSBarry Smith       oidx       = 7*(*vi++);
1964f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1965f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1966f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1967f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1968f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1969f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1970f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1971f1af5d2fSBarry Smith       v         += 49;
1972f1af5d2fSBarry Smith     }
    /* block i of t now holds the product computed with the diag[i] block */
1973f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1974f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1975f1af5d2fSBarry Smith     idx     += 7;
1976f1af5d2fSBarry Smith   }
1977f1af5d2fSBarry Smith   /* backward solve the L^T */
1978f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
    /* start at the block just before the diagonal and walk back to position ai[i] */
1979f1af5d2fSBarry Smith     v   = aa + 49*diag[i] - 49;
1980f1af5d2fSBarry Smith     vi  = aj + diag[i] - 1;
1981f1af5d2fSBarry Smith     nz  = diag[i] - ai[i];
1982f1af5d2fSBarry Smith     idt = 7*i;
1983f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1984f1af5d2fSBarry Smith     s6  = t[5+idt];s7 = t[6+idt];
1985f1af5d2fSBarry Smith     while (nz--) {
1986f1af5d2fSBarry Smith       idx       = 7*(*vi--);
1987f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1988f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1989f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1990f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1991f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1992f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1993f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1994f1af5d2fSBarry Smith       v        -= 49;
1995f1af5d2fSBarry Smith     }
1996f1af5d2fSBarry Smith   }
1997f1af5d2fSBarry Smith 
1998f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1999f1af5d2fSBarry Smith   ii = 0;
2000f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
2001f1af5d2fSBarry Smith     ir      = 7*r[i];
2002f1af5d2fSBarry Smith     x[ir]   = t[ii];
2003f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
2004f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
2005f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
2006f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
2007f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
2008f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
2009f1af5d2fSBarry Smith     ii     += 7;
2010f1af5d2fSBarry Smith   }
2011f1af5d2fSBarry Smith 
2012f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2013f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20143649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
20151ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* report the operation count for this solve to PetscLogFlops() */
2016dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2017f1af5d2fSBarry Smith   PetscFunctionReturn(0);
2018f1af5d2fSBarry Smith }
201932121132SShri Abhyankar #undef __FUNCT__
20204dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
20214dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
202232121132SShri Abhyankar {
202332121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
202432121132SShri Abhyankar   PetscErrorCode    ierr;
202532121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2026b3260449SShri Abhyankar   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
202732121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
202832121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2029b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2030b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2031b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2032b3260449SShri Abhyankar   const PetscScalar *b;
203332121132SShri Abhyankar 
203432121132SShri Abhyankar   PetscFunctionBegin;
20353649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
203632121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
203732121132SShri Abhyankar   t    = a->solve_work;
203832121132SShri Abhyankar 
203932121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
204032121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
204132121132SShri Abhyankar 
204232121132SShri Abhyankar   /* copy b into temp work space according to permutation */
204332121132SShri Abhyankar   for (i=0; i<n; i++) {
204432121132SShri Abhyankar     ii      = bs*i; ic = bs*c[i];
204532121132SShri Abhyankar     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
204632121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
204732121132SShri Abhyankar   }
204832121132SShri Abhyankar 
204932121132SShri Abhyankar   /* forward solve the U^T */
205032121132SShri Abhyankar   idx = 0;
205132121132SShri Abhyankar   for (i=0; i<n; i++) {
205232121132SShri Abhyankar     v = aa + bs2*diag[i];
205332121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
205432121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
205532121132SShri Abhyankar     x6 = t[5+idx]; x7 = t[6+idx];
205632121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
205732121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
205832121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
205932121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
206032121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
206132121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
206232121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
206332121132SShri Abhyankar     v -= bs2;
206432121132SShri Abhyankar 
206532121132SShri Abhyankar     vi = aj + diag[i] - 1;
206632121132SShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
206732121132SShri Abhyankar     for (j=0; j>-nz; j--) {
206832121132SShri Abhyankar       oidx       = bs*vi[j];
206932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
207032121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
207132121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
207232121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
207332121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
207432121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
207532121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
207632121132SShri Abhyankar       v         -= bs2;
207732121132SShri Abhyankar     }
207832121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
207932121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
208032121132SShri Abhyankar     idx     += bs;
208132121132SShri Abhyankar   }
208232121132SShri Abhyankar   /* backward solve the L^T */
208332121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
208432121132SShri Abhyankar     v   = aa + bs2*ai[i];
208532121132SShri Abhyankar     vi  = aj + ai[i];
208632121132SShri Abhyankar     nz  = ai[i+1] - ai[i];
208732121132SShri Abhyankar     idt = bs*i;
208832121132SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
208932121132SShri Abhyankar     s6  = t[5+idt];  s7 = t[6+idt];
209032121132SShri Abhyankar     for (j=0; j<nz; j++) {
209132121132SShri Abhyankar       idx       = bs*vi[j];
209232121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
209332121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
209432121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
209532121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
209632121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
209732121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
209832121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
209932121132SShri Abhyankar       v        += bs2;
210032121132SShri Abhyankar     }
210132121132SShri Abhyankar   }
210232121132SShri Abhyankar 
210332121132SShri Abhyankar   /* copy t into x according to permutation */
210432121132SShri Abhyankar   for (i=0; i<n; i++) {
210532121132SShri Abhyankar     ii      = bs*i;  ir = bs*r[i];
210632121132SShri Abhyankar     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
210732121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
210832121132SShri Abhyankar   }
210932121132SShri Abhyankar 
211032121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
211132121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
211332121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
211432121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
211532121132SShri Abhyankar   PetscFunctionReturn(0);
211632121132SShri Abhyankar }
2117f1af5d2fSBarry Smith 
21184e2b4712SSatish Balay /* ----------------------------------------------------------- */
21194a2ae208SSatish Balay #undef __FUNCT__
212006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
212106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21224e2b4712SSatish Balay {
21234e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
21244e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21256849ba73SBarry Smith   PetscErrorCode    ierr;
2126b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2127b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2128b3260449SShri Abhyankar   PetscInt          i,nz;
2129b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2130b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2131b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2132b3260449SShri Abhyankar   const PetscScalar *b;
21334e2b4712SSatish Balay 
21344e2b4712SSatish Balay   PetscFunctionBegin;
21353649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21361ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2137f1af5d2fSBarry Smith   t    = a->solve_work;
21384e2b4712SSatish Balay 
21394e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21404e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21414e2b4712SSatish Balay 
21424e2b4712SSatish Balay   /* forward solve the lower triangular */
214387828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21444e2b4712SSatish Balay   for (i=1; i<n; i++) {
21454e2b4712SSatish Balay     v    = aa + bs2*ai[i];
21464e2b4712SSatish Balay     vi   = aj + ai[i];
21474e2b4712SSatish Balay     nz   = a->diag[i] - ai[i];
2148f1af5d2fSBarry Smith     s    = t + bs*i;
214987828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21504e2b4712SSatish Balay     while (nz--) {
215196b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
21524e2b4712SSatish Balay       v += bs2;
21534e2b4712SSatish Balay     }
21544e2b4712SSatish Balay   }
21554e2b4712SSatish Balay   /* backward solve the upper triangular */
2156d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
21574e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
21584e2b4712SSatish Balay     v    = aa + bs2*(a->diag[i] + 1);
21594e2b4712SSatish Balay     vi   = aj + a->diag[i] + 1;
21604e2b4712SSatish Balay     nz   = ai[i+1] - a->diag[i] - 1;
216187828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21624e2b4712SSatish Balay     while (nz--) {
216396b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
21644e2b4712SSatish Balay       v += bs2;
21654e2b4712SSatish Balay     }
216696b95a6bSBarry Smith     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
216787828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21684e2b4712SSatish Balay   }
21694e2b4712SSatish Balay 
21704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21723649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
21731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2174dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21754e2b4712SSatish Balay   PetscFunctionReturn(0);
21764e2b4712SSatish Balay }
21774e2b4712SSatish Balay 
21785c42ef9dSBarry Smith /* ----------------------------------------------------------- */
21795c42ef9dSBarry Smith #undef __FUNCT__
218006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
218106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21825c42ef9dSBarry Smith {
21835c42ef9dSBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
21845c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
21855c42ef9dSBarry Smith   PetscErrorCode    ierr;
21865c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2187b3260449SShri Abhyankar   PetscInt          i,nz,j;
2188b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
21895c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
21905c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
21915c42ef9dSBarry Smith   const PetscScalar *b;
21926e111a19SKarl Rupp 
21935c42ef9dSBarry Smith   PetscFunctionBegin;
21943649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21955c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21965c42ef9dSBarry Smith   t    = a->solve_work;
21975c42ef9dSBarry Smith 
21985c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21995c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22005c42ef9dSBarry Smith 
22015c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
22025c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22035c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22045c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
22055c42ef9dSBarry Smith     }
22065c42ef9dSBarry Smith   }
22075c42ef9dSBarry Smith 
22085c42ef9dSBarry Smith 
22095c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
22105c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
22115c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22125c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
221396b95a6bSBarry Smith     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
22145c42ef9dSBarry Smith     v  = aa + bs2*(a->diag[i] + 1);
22155c42ef9dSBarry Smith     vi = aj + a->diag[i] + 1;
22165c42ef9dSBarry Smith     nz = ai[i+1] - a->diag[i] - 1;
22175c42ef9dSBarry Smith     while (nz--) {
221896b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22195c42ef9dSBarry Smith       v += bs2;
22205c42ef9dSBarry Smith     }
22215c42ef9dSBarry Smith   }
22225c42ef9dSBarry Smith 
22235c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
22245c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
22255c42ef9dSBarry Smith     v  = aa + bs2*ai[i];
22265c42ef9dSBarry Smith     vi = aj + ai[i];
22275c42ef9dSBarry Smith     nz = a->diag[i] - ai[i];
22285c42ef9dSBarry Smith     while (nz--) {
222996b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22305c42ef9dSBarry Smith       v += bs2;
22315c42ef9dSBarry Smith     }
22325c42ef9dSBarry Smith   }
22335c42ef9dSBarry Smith 
22345c42ef9dSBarry Smith   /* copy t into x according to permutation */
22355c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22365c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22375c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
22385c42ef9dSBarry Smith     }
22395c42ef9dSBarry Smith   }
22405c42ef9dSBarry Smith 
22415c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22425c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22433649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22445c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22455c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22465c42ef9dSBarry Smith   PetscFunctionReturn(0);
22475c42ef9dSBarry Smith }
22485c42ef9dSBarry Smith 
22494a2ae208SSatish Balay #undef __FUNCT__
22504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
22514dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
22528499736aSShri Abhyankar {
22538499736aSShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
22548499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22558499736aSShri Abhyankar   PetscErrorCode    ierr;
2256b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2257b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2258b3260449SShri Abhyankar   PetscInt          i,j,nz;
2259b3260449SShri Abhyankar   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
22608499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
22618499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
22628499736aSShri Abhyankar   const PetscScalar *b;
2263b3260449SShri Abhyankar 
22648499736aSShri Abhyankar   PetscFunctionBegin;
22653649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
22668499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22678499736aSShri Abhyankar   t    = a->solve_work;
22688499736aSShri Abhyankar 
22698499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22708499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22718499736aSShri Abhyankar 
22728499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
22738499736aSShri Abhyankar   for (i=0; i<n; i++) {
22748499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22758499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
22768499736aSShri Abhyankar     }
22778499736aSShri Abhyankar   }
22788499736aSShri Abhyankar 
22798499736aSShri Abhyankar 
22808499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
22818499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
22828499736aSShri Abhyankar   for (i=0; i<n; i++) {
22838499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
228496b95a6bSBarry Smith     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
22858499736aSShri Abhyankar     v  = aa + bs2*(diag[i] - 1);
22868499736aSShri Abhyankar     vi = aj + diag[i] - 1;
22878499736aSShri Abhyankar     nz = diag[i] - diag[i+1] - 1;
22888499736aSShri Abhyankar     for (j=0; j>-nz; j--) {
228996b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22908499736aSShri Abhyankar       v -= bs2;
22918499736aSShri Abhyankar     }
22928499736aSShri Abhyankar   }
22938499736aSShri Abhyankar 
22948499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
22958499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
22968499736aSShri Abhyankar     v  = aa + bs2*ai[i];
22978499736aSShri Abhyankar     vi = aj + ai[i];
22988499736aSShri Abhyankar     nz = ai[i+1] - ai[i];
22998499736aSShri Abhyankar     for (j=0; j<nz; j++) {
230096b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23018499736aSShri Abhyankar       v += bs2;
23028499736aSShri Abhyankar     }
23038499736aSShri Abhyankar   }
23048499736aSShri Abhyankar 
23058499736aSShri Abhyankar   /* copy t into x according to permutation */
23068499736aSShri Abhyankar   for (i=0; i<n; i++) {
23078499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23088499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
23098499736aSShri Abhyankar     }
23108499736aSShri Abhyankar   }
23118499736aSShri Abhyankar 
23128499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23138499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23143649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
23158499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23168499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
23178499736aSShri Abhyankar   PetscFunctionReturn(0);
23188499736aSShri Abhyankar }
23198499736aSShri Abhyankar 
2320832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
232129a97285SShri Abhyankar 
23222b0b2ea7SShri Abhyankar #undef __FUNCT__
2323832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2324832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
23252b0b2ea7SShri Abhyankar {
23262b0b2ea7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
23272b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2328b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23290fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
23300b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
23312b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
23322b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
23330fa040f9SShri Abhyankar   PetscScalar       *x;
23340b68f018SBarry Smith   const PetscScalar *b;
23352b0b2ea7SShri Abhyankar 
23362b0b2ea7SShri Abhyankar   PetscFunctionBegin;
23373649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23382b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23392b0b2ea7SShri Abhyankar 
23402b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
234129a97285SShri Abhyankar   idx   = 0;
23420fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
23430fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
23440fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
23452b0b2ea7SShri Abhyankar 
23462b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
23472b0b2ea7SShri Abhyankar     v   = aa + bs2*ai[i];
23482b0b2ea7SShri Abhyankar     vi  = aj + ai[i];
23492b0b2ea7SShri Abhyankar     nz  = ai[i+1] - ai[i];
23500fa040f9SShri Abhyankar     idt = bs*i;
23510fa040f9SShri Abhyankar     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
23520fa040f9SShri Abhyankar     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
23530fa040f9SShri Abhyankar     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
23542b0b2ea7SShri Abhyankar     for (m=0; m<nz; m++) {
23552b0b2ea7SShri Abhyankar       idx = bs*vi[m];
23560fa040f9SShri Abhyankar       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23570fa040f9SShri Abhyankar       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
23580fa040f9SShri Abhyankar       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
23592b0b2ea7SShri Abhyankar 
23600b8f6341SShri Abhyankar 
23612b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
23622b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
23632b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
23642b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
23652b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
23662b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
23672b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
23682b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
23692b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
23702b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
23712b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
23722b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
23732b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
23742b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
23752b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
23762b0b2ea7SShri Abhyankar 
23772b0b2ea7SShri Abhyankar       v += bs2;
23782b0b2ea7SShri Abhyankar     }
23790fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
23800fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
23810fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
23822b0b2ea7SShri Abhyankar 
23832b0b2ea7SShri Abhyankar   }
23842b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
23852b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--) {
23862b0b2ea7SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
23872b0b2ea7SShri Abhyankar     vi  = aj + adiag[i+1]+1;
23882b0b2ea7SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
23892b0b2ea7SShri Abhyankar     idt = bs*i;
23900fa040f9SShri Abhyankar     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
23910fa040f9SShri Abhyankar     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
23920fa040f9SShri Abhyankar     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
23932b0b2ea7SShri Abhyankar 
23942b0b2ea7SShri Abhyankar     for (m=0; m<nz; m++) {
23952b0b2ea7SShri Abhyankar       idx = bs*vi[m];
23960fa040f9SShri Abhyankar       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23970fa040f9SShri Abhyankar       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
23980fa040f9SShri Abhyankar       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
23992b0b2ea7SShri Abhyankar 
24002b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24012b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24022b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24032b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24042b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24052b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24062b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24072b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24082b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24092b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24102b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24112b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24122b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24132b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24142b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24152b0b2ea7SShri Abhyankar 
24162b0b2ea7SShri Abhyankar       v += bs2;
24172b0b2ea7SShri Abhyankar     }
24182b0b2ea7SShri Abhyankar 
24190fa040f9SShri Abhyankar     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
24200fa040f9SShri Abhyankar     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
24210fa040f9SShri Abhyankar     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
24220fa040f9SShri Abhyankar     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
24230fa040f9SShri Abhyankar     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
24240fa040f9SShri Abhyankar     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
24250fa040f9SShri Abhyankar     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
24260fa040f9SShri Abhyankar     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
24270fa040f9SShri Abhyankar     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
24280fa040f9SShri Abhyankar     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
24290fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
24300fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
24310fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
24320fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
24330fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
24342b0b2ea7SShri Abhyankar 
24352b0b2ea7SShri Abhyankar   }
24362b0b2ea7SShri Abhyankar 
24373649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
24382b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24392b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24402b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
24412b0b2ea7SShri Abhyankar }
24422b0b2ea7SShri Abhyankar 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2444832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2445832cc040SShri Abhyankar 
24468499736aSShri Abhyankar #undef __FUNCT__
2447832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2448832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
24490b8f6341SShri Abhyankar {
24500b8f6341SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
24510b8f6341SShri Abhyankar   PetscErrorCode    ierr;
24520b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
245353ef36baSBarry Smith   PetscInt          i,k,nz,idx,idt,m;
24540b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24550b8f6341SShri Abhyankar   PetscScalar       s[15];
245653ef36baSBarry Smith   PetscScalar       *x,xv;
24570b8f6341SShri Abhyankar   const PetscScalar *b;
24580b8f6341SShri Abhyankar 
24590b8f6341SShri Abhyankar   PetscFunctionBegin;
24603649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
24610b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24620b8f6341SShri Abhyankar 
24630b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2464832cc040SShri Abhyankar   for (i=0; i<n; i++) {
24650b8f6341SShri Abhyankar     v         = aa + bs2*ai[i];
24660b8f6341SShri Abhyankar     vi        = aj + ai[i];
24670b8f6341SShri Abhyankar     nz        = ai[i+1] - ai[i];
24680fa040f9SShri Abhyankar     idt       = bs*i;
2469832cc040SShri Abhyankar     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2470832cc040SShri Abhyankar     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2471832cc040SShri Abhyankar     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
24720b8f6341SShri Abhyankar     for (m=0; m<nz; m++) {
24730b8f6341SShri Abhyankar       idx = bs*vi[m];
24740b8f6341SShri Abhyankar       for (k=0; k<15; k++) {
247553ef36baSBarry Smith         xv         = x[k + idx];
247653ef36baSBarry Smith         x[idt]    -= v[0]*xv;
247753ef36baSBarry Smith         x[1+idt]  -= v[1]*xv;
247853ef36baSBarry Smith         x[2+idt]  -= v[2]*xv;
247953ef36baSBarry Smith         x[3+idt]  -= v[3]*xv;
248053ef36baSBarry Smith         x[4+idt]  -= v[4]*xv;
248153ef36baSBarry Smith         x[5+idt]  -= v[5]*xv;
248253ef36baSBarry Smith         x[6+idt]  -= v[6]*xv;
248353ef36baSBarry Smith         x[7+idt]  -= v[7]*xv;
248453ef36baSBarry Smith         x[8+idt]  -= v[8]*xv;
248553ef36baSBarry Smith         x[9+idt]  -= v[9]*xv;
248653ef36baSBarry Smith         x[10+idt] -= v[10]*xv;
248753ef36baSBarry Smith         x[11+idt] -= v[11]*xv;
248853ef36baSBarry Smith         x[12+idt] -= v[12]*xv;
248953ef36baSBarry Smith         x[13+idt] -= v[13]*xv;
249053ef36baSBarry Smith         x[14+idt] -= v[14]*xv;
24910b8f6341SShri Abhyankar         v         += 15;
24920b8f6341SShri Abhyankar       }
24930b8f6341SShri Abhyankar     }
24940b8f6341SShri Abhyankar   }
24950b8f6341SShri Abhyankar   /* backward solve the upper triangular */
24960b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--) {
24970b8f6341SShri Abhyankar     v     = aa + bs2*(adiag[i+1]+1);
24980b8f6341SShri Abhyankar     vi    = aj + adiag[i+1]+1;
24990b8f6341SShri Abhyankar     nz    = adiag[i] - adiag[i+1] - 1;
25000b8f6341SShri Abhyankar     idt   = bs*i;
25010fa040f9SShri Abhyankar     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
25020fa040f9SShri Abhyankar     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
25030fa040f9SShri Abhyankar     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
25040b8f6341SShri Abhyankar 
25050b8f6341SShri Abhyankar     for (m=0; m<nz; m++) {
25060b8f6341SShri Abhyankar       idx = bs*vi[m];
25070b8f6341SShri Abhyankar       for (k=0; k<15; k++) {
250853ef36baSBarry Smith         xv     = x[k + idx];
250953ef36baSBarry Smith         s[0]  -= v[0]*xv;
251053ef36baSBarry Smith         s[1]  -= v[1]*xv;
251153ef36baSBarry Smith         s[2]  -= v[2]*xv;
251253ef36baSBarry Smith         s[3]  -= v[3]*xv;
251353ef36baSBarry Smith         s[4]  -= v[4]*xv;
251453ef36baSBarry Smith         s[5]  -= v[5]*xv;
251553ef36baSBarry Smith         s[6]  -= v[6]*xv;
251653ef36baSBarry Smith         s[7]  -= v[7]*xv;
251753ef36baSBarry Smith         s[8]  -= v[8]*xv;
251853ef36baSBarry Smith         s[9]  -= v[9]*xv;
251953ef36baSBarry Smith         s[10] -= v[10]*xv;
252053ef36baSBarry Smith         s[11] -= v[11]*xv;
252153ef36baSBarry Smith         s[12] -= v[12]*xv;
252253ef36baSBarry Smith         s[13] -= v[13]*xv;
252353ef36baSBarry Smith         s[14] -= v[14]*xv;
25240b8f6341SShri Abhyankar         v     += 15;
25250b8f6341SShri Abhyankar       }
25260b8f6341SShri Abhyankar     }
25270fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
25280b8f6341SShri Abhyankar     for (k=0; k<15; k++) {
25290fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
25300fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
25310fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
25320fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
25330fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
25340fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
25350fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
25360fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
25370fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
25380fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
25390fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
25400fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
25410fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
25420fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
25430fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
25440b8f6341SShri Abhyankar       v         += 15;
25450b8f6341SShri Abhyankar     }
25460b8f6341SShri Abhyankar   }
25473649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
25480b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25490b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
25500b8f6341SShri Abhyankar   PetscFunctionReturn(0);
25510b8f6341SShri Abhyankar }
25520b8f6341SShri Abhyankar 
25530b8f6341SShri Abhyankar 
25540b8f6341SShri Abhyankar #undef __FUNCT__
255506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
255606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
25574e2b4712SSatish Balay {
25584e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
25594e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
25606849ba73SBarry Smith   PetscErrorCode    ierr;
2561b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2562b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2563b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2564b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2565b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2566b3260449SShri Abhyankar   const PetscScalar *b;
25674e2b4712SSatish Balay 
25684e2b4712SSatish Balay   PetscFunctionBegin;
25693649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2571f1af5d2fSBarry Smith   t    = a->solve_work;
25724e2b4712SSatish Balay 
25734e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25744e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25754e2b4712SSatish Balay 
25764e2b4712SSatish Balay   /* forward solve the lower triangular */
25774e2b4712SSatish Balay   idx  = 7*(*r++);
2578f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2579f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2580f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
25814e2b4712SSatish Balay 
25824e2b4712SSatish Balay   for (i=1; i<n; i++) {
25834e2b4712SSatish Balay     v   = aa + 49*ai[i];
25844e2b4712SSatish Balay     vi  = aj + ai[i];
25854e2b4712SSatish Balay     nz  = diag[i] - ai[i];
25864e2b4712SSatish Balay     idx = 7*(*r++);
2587f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2588f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
25894e2b4712SSatish Balay     while (nz--) {
25904e2b4712SSatish Balay       idx = 7*(*vi++);
2591f1af5d2fSBarry Smith       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2592f1af5d2fSBarry Smith       x4  = t[3+idx];x5 = t[4+idx];
2593f1af5d2fSBarry Smith       x6  = t[5+idx];x7 = t[6+idx];
2594f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2595f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2596f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2597f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2598f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2599f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2600f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26014e2b4712SSatish Balay       v  += 49;
26024e2b4712SSatish Balay     }
26034e2b4712SSatish Balay     idx      = 7*i;
2604f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2605f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2606f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
26074e2b4712SSatish Balay   }
26084e2b4712SSatish Balay   /* backward solve the upper triangular */
26094e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
26104e2b4712SSatish Balay     v   = aa + 49*diag[i] + 49;
26114e2b4712SSatish Balay     vi  = aj + diag[i] + 1;
26124e2b4712SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
26134e2b4712SSatish Balay     idt = 7*i;
2614f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt];
2615f1af5d2fSBarry Smith     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2616f1af5d2fSBarry Smith     s6  = t[5+idt];s7 = t[6+idt];
26174e2b4712SSatish Balay     while (nz--) {
26184e2b4712SSatish Balay       idx = 7*(*vi++);
2619f1af5d2fSBarry Smith       x1  = t[idx];   x2 = t[1+idx];
2620f1af5d2fSBarry Smith       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2621f1af5d2fSBarry Smith       x6  = t[5+idx]; x7 = t[6+idx];
2622f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2623f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2624f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2625f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2626f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2627f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2628f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26294e2b4712SSatish Balay       v  += 49;
26304e2b4712SSatish Balay     }
26314e2b4712SSatish Balay     idc    = 7*(*c--);
26324e2b4712SSatish Balay     v      = aa + 49*diag[i];
2633f1af5d2fSBarry Smith     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2634f1af5d2fSBarry Smith                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2635f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2636f1af5d2fSBarry Smith                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2637f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2638f1af5d2fSBarry Smith                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2639f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2640f1af5d2fSBarry Smith                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2641f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2642f1af5d2fSBarry Smith                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2643f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2644f1af5d2fSBarry Smith                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2645f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2646f1af5d2fSBarry Smith                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
26474e2b4712SSatish Balay   }
26484e2b4712SSatish Balay 
26494e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26504e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26513649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2653dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26544e2b4712SSatish Balay   PetscFunctionReturn(0);
26554e2b4712SSatish Balay }
26564e2b4712SSatish Balay 
26578f690400SShri Abhyankar #undef __FUNCT__
26584dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
26594dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
266035aa4fcfSShri Abhyankar {
266135aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
266235aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
266335aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2664b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2665b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2666b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2667b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2668b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2669b3260449SShri Abhyankar   const PetscScalar *b;
267035aa4fcfSShri Abhyankar 
267135aa4fcfSShri Abhyankar   PetscFunctionBegin;
26723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
267335aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
267435aa4fcfSShri Abhyankar   t    = a->solve_work;
267535aa4fcfSShri Abhyankar 
267635aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
267735aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
267835aa4fcfSShri Abhyankar 
267935aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
268035aa4fcfSShri Abhyankar   idx  = 7*r[0];
268135aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
268235aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
268335aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
268435aa4fcfSShri Abhyankar 
268535aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
268635aa4fcfSShri Abhyankar     v   = aa + 49*ai[i];
268735aa4fcfSShri Abhyankar     vi  = aj + ai[i];
268835aa4fcfSShri Abhyankar     nz  = ai[i+1] - ai[i];
268935aa4fcfSShri Abhyankar     idx = 7*r[i];
269035aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
269135aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
269235aa4fcfSShri Abhyankar     for (m=0; m<nz; m++) {
269335aa4fcfSShri Abhyankar       idx = 7*vi[m];
269435aa4fcfSShri Abhyankar       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
269535aa4fcfSShri Abhyankar       x4  = t[3+idx];x5 = t[4+idx];
269635aa4fcfSShri Abhyankar       x6  = t[5+idx];x7 = t[6+idx];
269735aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
269835aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
269935aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
270035aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
270135aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
270235aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
270335aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
270435aa4fcfSShri Abhyankar       v  += 49;
270535aa4fcfSShri Abhyankar     }
270635aa4fcfSShri Abhyankar     idx      = 7*i;
270735aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
270835aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
270935aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
271035aa4fcfSShri Abhyankar   }
271135aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
271235aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--) {
271335aa4fcfSShri Abhyankar     v   = aa + 49*(adiag[i+1]+1);
271435aa4fcfSShri Abhyankar     vi  = aj + adiag[i+1]+1;
271535aa4fcfSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
271635aa4fcfSShri Abhyankar     idt = 7*i;
271735aa4fcfSShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];
271835aa4fcfSShri Abhyankar     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
271935aa4fcfSShri Abhyankar     s6  = t[5+idt];s7 = t[6+idt];
272035aa4fcfSShri Abhyankar     for (m=0; m<nz; m++) {
272135aa4fcfSShri Abhyankar       idx = 7*vi[m];
272235aa4fcfSShri Abhyankar       x1  = t[idx];   x2 = t[1+idx];
272335aa4fcfSShri Abhyankar       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
272435aa4fcfSShri Abhyankar       x6  = t[5+idx]; x7 = t[6+idx];
272535aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
272635aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
272735aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
272835aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
272935aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
273035aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
273135aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
273235aa4fcfSShri Abhyankar       v  += 49;
273335aa4fcfSShri Abhyankar     }
273435aa4fcfSShri Abhyankar     idc    = 7*c[i];
273535aa4fcfSShri Abhyankar     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
273635aa4fcfSShri Abhyankar                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
273735aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
273835aa4fcfSShri Abhyankar                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
273935aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
274035aa4fcfSShri Abhyankar                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
274135aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
274235aa4fcfSShri Abhyankar                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
274335aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
274435aa4fcfSShri Abhyankar                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
274535aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
274635aa4fcfSShri Abhyankar                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
274735aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
274835aa4fcfSShri Abhyankar                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
274935aa4fcfSShri Abhyankar   }
275035aa4fcfSShri Abhyankar 
275135aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
275235aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27533649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
275435aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
275535aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
275635aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
275735aa4fcfSShri Abhyankar }
275835aa4fcfSShri Abhyankar 
275935aa4fcfSShri Abhyankar #undef __FUNCT__
276006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
276106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
276215091d37SBarry Smith {
276315091d37SBarry Smith   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
2764b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2765dfbe8321SBarry Smith   PetscErrorCode    ierr;
2766b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2767d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2768d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2769d9fead3dSBarry Smith   const PetscScalar *b;
277015091d37SBarry Smith 
277115091d37SBarry Smith   PetscFunctionBegin;
27723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
27731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277415091d37SBarry Smith   /* forward solve the lower triangular */
277515091d37SBarry Smith   idx  = 0;
277615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
277715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
277815091d37SBarry Smith   x[6] = b[6+idx];
277915091d37SBarry Smith   for (i=1; i<n; i++) {
278015091d37SBarry Smith     v   =  aa + 49*ai[i];
278115091d37SBarry Smith     vi  =  aj + ai[i];
278215091d37SBarry Smith     nz  =  diag[i] - ai[i];
278315091d37SBarry Smith     idx =  7*i;
2784f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2785f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2786f1af5d2fSBarry Smith     s7  =  b[6+idx];
278715091d37SBarry Smith     while (nz--) {
278815091d37SBarry Smith       jdx = 7*(*vi++);
278915091d37SBarry Smith       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
279015091d37SBarry Smith       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
279115091d37SBarry Smith       x7  = x[6+jdx];
2792f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2793f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2794f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2795f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2796f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2797f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2798f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
279915091d37SBarry Smith       v  += 49;
280015091d37SBarry Smith     }
2801f1af5d2fSBarry Smith     x[idx]   = s1;
2802f1af5d2fSBarry Smith     x[1+idx] = s2;
2803f1af5d2fSBarry Smith     x[2+idx] = s3;
2804f1af5d2fSBarry Smith     x[3+idx] = s4;
2805f1af5d2fSBarry Smith     x[4+idx] = s5;
2806f1af5d2fSBarry Smith     x[5+idx] = s6;
2807f1af5d2fSBarry Smith     x[6+idx] = s7;
280815091d37SBarry Smith   }
280915091d37SBarry Smith   /* backward solve the upper triangular */
281015091d37SBarry Smith   for (i=n-1; i>=0; i--) {
281115091d37SBarry Smith     v   = aa + 49*diag[i] + 49;
281215091d37SBarry Smith     vi  = aj + diag[i] + 1;
281315091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
281415091d37SBarry Smith     idt = 7*i;
2815f1af5d2fSBarry Smith     s1  = x[idt];   s2 = x[1+idt];
2816f1af5d2fSBarry Smith     s3  = x[2+idt]; s4 = x[3+idt];
2817f1af5d2fSBarry Smith     s5  = x[4+idt]; s6 = x[5+idt];
2818f1af5d2fSBarry Smith     s7  = x[6+idt];
281915091d37SBarry Smith     while (nz--) {
282015091d37SBarry Smith       idx = 7*(*vi++);
282115091d37SBarry Smith       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
282215091d37SBarry Smith       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
282315091d37SBarry Smith       x7  = x[6+idx];
2824f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2825f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2826f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2827f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2828f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2829f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2830f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
283115091d37SBarry Smith       v  += 49;
283215091d37SBarry Smith     }
283315091d37SBarry Smith     v      = aa + 49*diag[i];
2834f1af5d2fSBarry Smith     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2835f1af5d2fSBarry Smith              + v[28]*s5 + v[35]*s6 + v[42]*s7;
2836f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2837f1af5d2fSBarry Smith                + v[29]*s5 + v[36]*s6 + v[43]*s7;
2838f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2839f1af5d2fSBarry Smith                + v[30]*s5 + v[37]*s6 + v[44]*s7;
2840f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2841f1af5d2fSBarry Smith                + v[31]*s5 + v[38]*s6 + v[45]*s7;
2842f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2843f1af5d2fSBarry Smith                + v[32]*s5 + v[39]*s6 + v[46]*s7;
2844f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2845f1af5d2fSBarry Smith                + v[33]*s5 + v[40]*s6 + v[47]*s7;
2846f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2847f1af5d2fSBarry Smith                + v[34]*s5 + v[41]*s6 + v[48]*s7;
284815091d37SBarry Smith   }
284915091d37SBarry Smith 
28503649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
28511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2852dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
285315091d37SBarry Smith   PetscFunctionReturn(0);
285415091d37SBarry Smith }
285515091d37SBarry Smith 
2856cee9d6f2SShri Abhyankar #undef __FUNCT__
28574dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
28584dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
285953cca76cSShri Abhyankar {
286053cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2861b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
286253cca76cSShri Abhyankar   PetscErrorCode    ierr;
2863b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,jdx,idt;
2864b3260449SShri Abhyankar   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
286553cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
286653cca76cSShri Abhyankar   PetscScalar       *x;
286753cca76cSShri Abhyankar   const PetscScalar *b;
286853cca76cSShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
286953cca76cSShri Abhyankar 
287053cca76cSShri Abhyankar   PetscFunctionBegin;
28713649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
287253cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
287353cca76cSShri Abhyankar   /* forward solve the lower triangular */
287453cca76cSShri Abhyankar   idx  = 0;
287553cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
287653cca76cSShri Abhyankar   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
287753cca76cSShri Abhyankar   for (i=1; i<n; i++) {
287853cca76cSShri Abhyankar     v   = aa + bs2*ai[i];
287953cca76cSShri Abhyankar     vi  = aj + ai[i];
288053cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
288153cca76cSShri Abhyankar     idx = bs*i;
288253cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
288353cca76cSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
288453cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
288553cca76cSShri Abhyankar       jdx = bs*vi[k];
288653cca76cSShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
288753cca76cSShri Abhyankar       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
288853cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
288953cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
289053cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
289153cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
289253cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
289353cca76cSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
289453cca76cSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
289553cca76cSShri Abhyankar       v  +=  bs2;
289653cca76cSShri Abhyankar     }
289753cca76cSShri Abhyankar 
289853cca76cSShri Abhyankar     x[idx]   = s1;
289953cca76cSShri Abhyankar     x[1+idx] = s2;
290053cca76cSShri Abhyankar     x[2+idx] = s3;
290153cca76cSShri Abhyankar     x[3+idx] = s4;
290253cca76cSShri Abhyankar     x[4+idx] = s5;
290353cca76cSShri Abhyankar     x[5+idx] = s6;
290453cca76cSShri Abhyankar     x[6+idx] = s7;
290553cca76cSShri Abhyankar   }
290653cca76cSShri Abhyankar 
290753cca76cSShri Abhyankar   /* backward solve the upper triangular */
290853cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
290953cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
291053cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
291153cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
291253cca76cSShri Abhyankar     idt = bs*i;
291353cca76cSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
291453cca76cSShri Abhyankar     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
291553cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
291653cca76cSShri Abhyankar       idx = bs*vi[k];
291753cca76cSShri Abhyankar       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
291853cca76cSShri Abhyankar       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
291953cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
292053cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
292153cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
292253cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
292353cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
292453cca76cSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
292553cca76cSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
292653cca76cSShri Abhyankar       v  +=  bs2;
292753cca76cSShri Abhyankar     }
292853cca76cSShri Abhyankar     /* x = inv_diagonal*x */
292953cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
293053cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
293153cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
293253cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
293353cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
293453cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
293553cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
293653cca76cSShri Abhyankar   }
293753cca76cSShri Abhyankar 
29383649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
293953cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
294053cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
294153cca76cSShri Abhyankar   PetscFunctionReturn(0);
294253cca76cSShri Abhyankar }
294353cca76cSShri Abhyankar 
294453cca76cSShri Abhyankar #undef __FUNCT__
294506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
294606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
294715091d37SBarry Smith {
294815091d37SBarry Smith   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
294915091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
29506849ba73SBarry Smith   PetscErrorCode    ierr;
29515d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2952b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2953b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2954d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2955d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2956d9fead3dSBarry Smith   const PetscScalar *b;
2957b3260449SShri Abhyankar 
295815091d37SBarry Smith   PetscFunctionBegin;
29593649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
29601ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2961f1af5d2fSBarry Smith   t    = a->solve_work;
296215091d37SBarry Smith 
296315091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
296415091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
296515091d37SBarry Smith 
296615091d37SBarry Smith   /* forward solve the lower triangular */
296715091d37SBarry Smith   idx  = 6*(*r++);
2968f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2969f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2970f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
297115091d37SBarry Smith   for (i=1; i<n; i++) {
297215091d37SBarry Smith     v   = aa + 36*ai[i];
297315091d37SBarry Smith     vi  = aj + ai[i];
297415091d37SBarry Smith     nz  = diag[i] - ai[i];
297515091d37SBarry Smith     idx = 6*(*r++);
2976f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2977f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
297815091d37SBarry Smith     while (nz--) {
297915091d37SBarry Smith       idx = 6*(*vi++);
2980f1af5d2fSBarry Smith       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2981f1af5d2fSBarry Smith       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2982f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2983f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2984f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2985f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2986f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2987f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
298815091d37SBarry Smith       v  += 36;
298915091d37SBarry Smith     }
299015091d37SBarry Smith     idx      = 6*i;
2991f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2992f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2993f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
299415091d37SBarry Smith   }
299515091d37SBarry Smith   /* backward solve the upper triangular */
299615091d37SBarry Smith   for (i=n-1; i>=0; i--) {
299715091d37SBarry Smith     v   = aa + 36*diag[i] + 36;
299815091d37SBarry Smith     vi  = aj + diag[i] + 1;
299915091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
300015091d37SBarry Smith     idt = 6*i;
3001f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt];
3002f1af5d2fSBarry Smith     s3  = t[2+idt];s4 = t[3+idt];
3003f1af5d2fSBarry Smith     s5  = t[4+idt];s6 = t[5+idt];
300415091d37SBarry Smith     while (nz--) {
300515091d37SBarry Smith       idx = 6*(*vi++);
3006f1af5d2fSBarry Smith       x1  = t[idx];   x2 = t[1+idx];
3007f1af5d2fSBarry Smith       x3  = t[2+idx]; x4 = t[3+idx];
3008f1af5d2fSBarry Smith       x5  = t[4+idx]; x6 = t[5+idx];
3009f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3010f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3011f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3012f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3013f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3014f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
301515091d37SBarry Smith       v  += 36;
301615091d37SBarry Smith     }
301715091d37SBarry Smith     idc    = 6*(*c--);
301815091d37SBarry Smith     v      = aa + 36*diag[i];
3019f1af5d2fSBarry Smith     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3020f1af5d2fSBarry Smith                         v[18]*s4+v[24]*s5+v[30]*s6;
3021f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3022f1af5d2fSBarry Smith                           v[19]*s4+v[25]*s5+v[31]*s6;
3023f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3024f1af5d2fSBarry Smith                           v[20]*s4+v[26]*s5+v[32]*s6;
3025f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3026f1af5d2fSBarry Smith                           v[21]*s4+v[27]*s5+v[33]*s6;
3027f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3028f1af5d2fSBarry Smith                           v[22]*s4+v[28]*s5+v[34]*s6;
3029f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3030f1af5d2fSBarry Smith                           v[23]*s4+v[29]*s5+v[35]*s6;
303115091d37SBarry Smith   }
303215091d37SBarry Smith 
303315091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
303415091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30353649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
30361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3037dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
303815091d37SBarry Smith   PetscFunctionReturn(0);
303915091d37SBarry Smith }
304015091d37SBarry Smith 
30416506fda5SShri Abhyankar #undef __FUNCT__
30424dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
30434dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
30446506fda5SShri Abhyankar {
30456506fda5SShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
30466506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
30476506fda5SShri Abhyankar   PetscErrorCode    ierr;
30486506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3049b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3050b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
30516506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
30526506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
30536506fda5SShri Abhyankar   const PetscScalar *b;
3054b3260449SShri Abhyankar 
30556506fda5SShri Abhyankar   PetscFunctionBegin;
30563649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30576506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
30586506fda5SShri Abhyankar   t    = a->solve_work;
30596506fda5SShri Abhyankar 
30606506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30616506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
30626506fda5SShri Abhyankar 
30636506fda5SShri Abhyankar   /* forward solve the lower triangular */
30646506fda5SShri Abhyankar   idx  = 6*r[0];
30656506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
30666506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
30676506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
30686506fda5SShri Abhyankar   for (i=1; i<n; i++) {
30696506fda5SShri Abhyankar     v   = aa + 36*ai[i];
30706506fda5SShri Abhyankar     vi  = aj + ai[i];
30716506fda5SShri Abhyankar     nz  = ai[i+1] - ai[i];
30726506fda5SShri Abhyankar     idx = 6*r[i];
30736506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30746506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
30756506fda5SShri Abhyankar     for (m=0; m<nz; m++) {
30766506fda5SShri Abhyankar       idx = 6*vi[m];
30776506fda5SShri Abhyankar       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
30786506fda5SShri Abhyankar       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
30796506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30806506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30816506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30826506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
30836506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
30846506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30856506fda5SShri Abhyankar       v  += 36;
30866506fda5SShri Abhyankar     }
30876506fda5SShri Abhyankar     idx      = 6*i;
30886506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
30896506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
30906506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
30916506fda5SShri Abhyankar   }
30926506fda5SShri Abhyankar   /* backward solve the upper triangular */
30936506fda5SShri Abhyankar   for (i=n-1; i>=0; i--) {
30946506fda5SShri Abhyankar     v   = aa + 36*(adiag[i+1]+1);
30956506fda5SShri Abhyankar     vi  = aj + adiag[i+1]+1;
30966506fda5SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
30976506fda5SShri Abhyankar     idt = 6*i;
30986506fda5SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];
30996506fda5SShri Abhyankar     s3  = t[2+idt];s4 = t[3+idt];
31006506fda5SShri Abhyankar     s5  = t[4+idt];s6 = t[5+idt];
31016506fda5SShri Abhyankar     for (m=0; m<nz; m++) {
31026506fda5SShri Abhyankar       idx = 6*vi[m];
31036506fda5SShri Abhyankar       x1  = t[idx];   x2 = t[1+idx];
31046506fda5SShri Abhyankar       x3  = t[2+idx]; x4 = t[3+idx];
31056506fda5SShri Abhyankar       x5  = t[4+idx]; x6 = t[5+idx];
31066506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31076506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31086506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31096506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31106506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31116506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31126506fda5SShri Abhyankar       v  += 36;
31136506fda5SShri Abhyankar     }
31146506fda5SShri Abhyankar     idc    = 6*c[i];
31156506fda5SShri Abhyankar     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
31166506fda5SShri Abhyankar                         v[18]*s4+v[24]*s5+v[30]*s6;
31176506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
31186506fda5SShri Abhyankar                           v[19]*s4+v[25]*s5+v[31]*s6;
31196506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
31206506fda5SShri Abhyankar                           v[20]*s4+v[26]*s5+v[32]*s6;
31216506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
31226506fda5SShri Abhyankar                           v[21]*s4+v[27]*s5+v[33]*s6;
31236506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
31246506fda5SShri Abhyankar                           v[22]*s4+v[28]*s5+v[34]*s6;
31256506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
31266506fda5SShri Abhyankar                           v[23]*s4+v[29]*s5+v[35]*s6;
31276506fda5SShri Abhyankar   }
31286506fda5SShri Abhyankar 
31296506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31306506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31313649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
31326506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
31336506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
31346506fda5SShri Abhyankar   PetscFunctionReturn(0);
31356506fda5SShri Abhyankar }
31368f690400SShri Abhyankar 
31378f690400SShri Abhyankar #undef __FUNCT__
313806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
313906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
314015091d37SBarry Smith {
314115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3142b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3143dfbe8321SBarry Smith   PetscErrorCode    ierr;
3144b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3145d9fead3dSBarry Smith   const MatScalar   *aa   =a->a,*v;
3146d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3147d9fead3dSBarry Smith   const PetscScalar *b;
314815091d37SBarry Smith 
314915091d37SBarry Smith   PetscFunctionBegin;
31503649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31511ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315215091d37SBarry Smith   /* forward solve the lower triangular */
315315091d37SBarry Smith   idx  = 0;
315415091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
315515091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
315615091d37SBarry Smith   for (i=1; i<n; i++) {
315715091d37SBarry Smith     v   =  aa + 36*ai[i];
315815091d37SBarry Smith     vi  =  aj + ai[i];
315915091d37SBarry Smith     nz  =  diag[i] - ai[i];
316015091d37SBarry Smith     idx =  6*i;
3161f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3162f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
316315091d37SBarry Smith     while (nz--) {
316415091d37SBarry Smith       jdx = 6*(*vi++);
316515091d37SBarry Smith       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
316615091d37SBarry Smith       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3167f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3168f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3169f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3170f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3171f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3172f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
317315091d37SBarry Smith       v  += 36;
317415091d37SBarry Smith     }
3175f1af5d2fSBarry Smith     x[idx]   = s1;
3176f1af5d2fSBarry Smith     x[1+idx] = s2;
3177f1af5d2fSBarry Smith     x[2+idx] = s3;
3178f1af5d2fSBarry Smith     x[3+idx] = s4;
3179f1af5d2fSBarry Smith     x[4+idx] = s5;
3180f1af5d2fSBarry Smith     x[5+idx] = s6;
318115091d37SBarry Smith   }
318215091d37SBarry Smith   /* backward solve the upper triangular */
318315091d37SBarry Smith   for (i=n-1; i>=0; i--) {
318415091d37SBarry Smith     v   = aa + 36*diag[i] + 36;
318515091d37SBarry Smith     vi  = aj + diag[i] + 1;
318615091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
318715091d37SBarry Smith     idt = 6*i;
3188f1af5d2fSBarry Smith     s1  = x[idt];   s2 = x[1+idt];
3189f1af5d2fSBarry Smith     s3  = x[2+idt]; s4 = x[3+idt];
3190f1af5d2fSBarry Smith     s5  = x[4+idt]; s6 = x[5+idt];
319115091d37SBarry Smith     while (nz--) {
319215091d37SBarry Smith       idx = 6*(*vi++);
319315091d37SBarry Smith       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
319415091d37SBarry Smith       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3195f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3196f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3197f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3198f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3199f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3200f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
320115091d37SBarry Smith       v  += 36;
320215091d37SBarry Smith     }
320315091d37SBarry Smith     v        = aa + 36*diag[i];
3204f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3205f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3206f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3207f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3208f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3209f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
321015091d37SBarry Smith   }
321115091d37SBarry Smith 
32123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
32131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3214dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
321515091d37SBarry Smith   PetscFunctionReturn(0);
321615091d37SBarry Smith }
321715091d37SBarry Smith 
3218cee9d6f2SShri Abhyankar #undef __FUNCT__
32194dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
32204dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
322153cca76cSShri Abhyankar {
322253cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3223b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
322453cca76cSShri Abhyankar   PetscErrorCode    ierr;
3225b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,jdx,idt;
3226b3260449SShri Abhyankar   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
322753cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
322853cca76cSShri Abhyankar   PetscScalar       *x;
322953cca76cSShri Abhyankar   const PetscScalar *b;
323053cca76cSShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
323153cca76cSShri Abhyankar 
323253cca76cSShri Abhyankar   PetscFunctionBegin;
32333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
323453cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
323553cca76cSShri Abhyankar   /* forward solve the lower triangular */
323653cca76cSShri Abhyankar   idx  = 0;
323753cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
323853cca76cSShri Abhyankar   x[4] = b[4+idx];x[5] = b[5+idx];
323953cca76cSShri Abhyankar   for (i=1; i<n; i++) {
324053cca76cSShri Abhyankar     v   = aa + bs2*ai[i];
324153cca76cSShri Abhyankar     vi  = aj + ai[i];
324253cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
324353cca76cSShri Abhyankar     idx = bs*i;
324453cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
324553cca76cSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];
324653cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
324753cca76cSShri Abhyankar       jdx = bs*vi[k];
324853cca76cSShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
324953cca76cSShri Abhyankar       x5  = x[4+jdx]; x6 = x[5+jdx];
325053cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
325153cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
325253cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
325353cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
325453cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
325553cca76cSShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
325653cca76cSShri Abhyankar       v  +=  bs2;
325753cca76cSShri Abhyankar     }
325853cca76cSShri Abhyankar 
325953cca76cSShri Abhyankar     x[idx]   = s1;
326053cca76cSShri Abhyankar     x[1+idx] = s2;
326153cca76cSShri Abhyankar     x[2+idx] = s3;
326253cca76cSShri Abhyankar     x[3+idx] = s4;
326353cca76cSShri Abhyankar     x[4+idx] = s5;
326453cca76cSShri Abhyankar     x[5+idx] = s6;
326553cca76cSShri Abhyankar   }
326653cca76cSShri Abhyankar 
326753cca76cSShri Abhyankar   /* backward solve the upper triangular */
326853cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
326953cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
327053cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
327153cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
327253cca76cSShri Abhyankar     idt = bs*i;
327353cca76cSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
327453cca76cSShri Abhyankar     s5  = x[4+idt];s6 = x[5+idt];
327553cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
327653cca76cSShri Abhyankar       idx = bs*vi[k];
327753cca76cSShri Abhyankar       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
327853cca76cSShri Abhyankar       x5  = x[4+idx];x6 = x[5+idx];
327953cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
328053cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
328153cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
328253cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
328353cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
328453cca76cSShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
328553cca76cSShri Abhyankar       v  +=  bs2;
328653cca76cSShri Abhyankar     }
328753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
328853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
328953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
329053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
329153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
329253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
329353cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
329453cca76cSShri Abhyankar   }
329553cca76cSShri Abhyankar 
32963649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
329753cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
329853cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
329953cca76cSShri Abhyankar   PetscFunctionReturn(0);
330053cca76cSShri Abhyankar }
330153cca76cSShri Abhyankar 
330253cca76cSShri Abhyankar #undef __FUNCT__
330306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
330406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
33054e2b4712SSatish Balay {
33064e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
33074e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33086849ba73SBarry Smith   PetscErrorCode    ierr;
33095d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3310b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3311b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3312d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3313d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3314d9fead3dSBarry Smith   const PetscScalar *b;
33154e2b4712SSatish Balay 
33164e2b4712SSatish Balay   PetscFunctionBegin;
33173649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
33181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3319f1af5d2fSBarry Smith   t    = a->solve_work;
33204e2b4712SSatish Balay 
33214e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33224e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33234e2b4712SSatish Balay 
33244e2b4712SSatish Balay   /* forward solve the lower triangular */
33254e2b4712SSatish Balay   idx  = 5*(*r++);
3326f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3327f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
33284e2b4712SSatish Balay   for (i=1; i<n; i++) {
33294e2b4712SSatish Balay     v   = aa + 25*ai[i];
33304e2b4712SSatish Balay     vi  = aj + ai[i];
33314e2b4712SSatish Balay     nz  = diag[i] - ai[i];
33324e2b4712SSatish Balay     idx = 5*(*r++);
3333f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3334f1af5d2fSBarry Smith     s5  = b[4+idx];
33354e2b4712SSatish Balay     while (nz--) {
33364e2b4712SSatish Balay       idx = 5*(*vi++);
3337f1af5d2fSBarry Smith       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3338f1af5d2fSBarry Smith       x4  = t[3+idx];x5 = t[4+idx];
3339f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3340f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3341f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3342f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3343f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33444e2b4712SSatish Balay       v  += 25;
33454e2b4712SSatish Balay     }
33464e2b4712SSatish Balay     idx      = 5*i;
3347f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3348f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
33494e2b4712SSatish Balay   }
33504e2b4712SSatish Balay   /* backward solve the upper triangular */
33514e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
33524e2b4712SSatish Balay     v   = aa + 25*diag[i] + 25;
33534e2b4712SSatish Balay     vi  = aj + diag[i] + 1;
33544e2b4712SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
33554e2b4712SSatish Balay     idt = 5*i;
3356f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt];
3357f1af5d2fSBarry Smith     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
33584e2b4712SSatish Balay     while (nz--) {
33594e2b4712SSatish Balay       idx = 5*(*vi++);
3360f1af5d2fSBarry Smith       x1  = t[idx];   x2 = t[1+idx];
3361f1af5d2fSBarry Smith       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3362f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33674e2b4712SSatish Balay       v  += 25;
33684e2b4712SSatish Balay     }
33694e2b4712SSatish Balay     idc    = 5*(*c--);
33704e2b4712SSatish Balay     v      = aa + 25*diag[i];
3371f1af5d2fSBarry Smith     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3372f1af5d2fSBarry Smith                         v[15]*s4+v[20]*s5;
3373f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3374f1af5d2fSBarry Smith                           v[16]*s4+v[21]*s5;
3375f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3376f1af5d2fSBarry Smith                           v[17]*s4+v[22]*s5;
3377f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3378f1af5d2fSBarry Smith                           v[18]*s4+v[23]*s5;
3379f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3380f1af5d2fSBarry Smith                           v[19]*s4+v[24]*s5;
33814e2b4712SSatish Balay   }
33824e2b4712SSatish Balay 
33834e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
33844e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
33853649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
33861ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3387dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
33884e2b4712SSatish Balay   PetscFunctionReturn(0);
33894e2b4712SSatish Balay }
33904e2b4712SSatish Balay 
339178bb4007SShri Abhyankar #undef __FUNCT__
33924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
33934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
339478bb4007SShri Abhyankar {
339578bb4007SShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
339678bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
339778bb4007SShri Abhyankar   PetscErrorCode    ierr;
339878bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3399b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3400b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
340178bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
340278bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
340378bb4007SShri Abhyankar   const PetscScalar *b;
340478bb4007SShri Abhyankar 
340578bb4007SShri Abhyankar   PetscFunctionBegin;
34063649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
340778bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
340878bb4007SShri Abhyankar   t    = a->solve_work;
340978bb4007SShri Abhyankar 
341078bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
341178bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
341278bb4007SShri Abhyankar 
341378bb4007SShri Abhyankar   /* forward solve the lower triangular */
341478bb4007SShri Abhyankar   idx  = 5*r[0];
341578bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
341678bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
341778bb4007SShri Abhyankar   for (i=1; i<n; i++) {
341878bb4007SShri Abhyankar     v   = aa + 25*ai[i];
341978bb4007SShri Abhyankar     vi  = aj + ai[i];
342078bb4007SShri Abhyankar     nz  = ai[i+1] - ai[i];
342178bb4007SShri Abhyankar     idx = 5*r[i];
342278bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
342378bb4007SShri Abhyankar     s5  = b[4+idx];
342478bb4007SShri Abhyankar     for (m=0; m<nz; m++) {
342578bb4007SShri Abhyankar       idx = 5*vi[m];
342678bb4007SShri Abhyankar       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
342778bb4007SShri Abhyankar       x4  = t[3+idx];x5 = t[4+idx];
342878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
342978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
343078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
343178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
343278bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
343378bb4007SShri Abhyankar       v  += 25;
343478bb4007SShri Abhyankar     }
343578bb4007SShri Abhyankar     idx      = 5*i;
343678bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
343778bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
343878bb4007SShri Abhyankar   }
343978bb4007SShri Abhyankar   /* backward solve the upper triangular */
344078bb4007SShri Abhyankar   for (i=n-1; i>=0; i--) {
344178bb4007SShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
344278bb4007SShri Abhyankar     vi  = aj + adiag[i+1]+1;
344378bb4007SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
344478bb4007SShri Abhyankar     idt = 5*i;
344578bb4007SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];
344678bb4007SShri Abhyankar     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
344778bb4007SShri Abhyankar     for (m=0; m<nz; m++) {
344878bb4007SShri Abhyankar       idx = 5*vi[m];
344978bb4007SShri Abhyankar       x1  = t[idx];   x2 = t[1+idx];
345078bb4007SShri Abhyankar       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
345178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
345278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
345378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
345478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
345578bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
345678bb4007SShri Abhyankar       v  += 25;
345778bb4007SShri Abhyankar     }
345878bb4007SShri Abhyankar     idc    = 5*c[i];
345978bb4007SShri Abhyankar     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
346078bb4007SShri Abhyankar                         v[15]*s4+v[20]*s5;
346178bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
346278bb4007SShri Abhyankar                           v[16]*s4+v[21]*s5;
346378bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
346478bb4007SShri Abhyankar                           v[17]*s4+v[22]*s5;
346578bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
346678bb4007SShri Abhyankar                           v[18]*s4+v[23]*s5;
346778bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
346878bb4007SShri Abhyankar                           v[19]*s4+v[24]*s5;
346978bb4007SShri Abhyankar   }
347078bb4007SShri Abhyankar 
347178bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
347278bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34733649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
347478bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
347578bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
347678bb4007SShri Abhyankar   PetscFunctionReturn(0);
347778bb4007SShri Abhyankar }
347878bb4007SShri Abhyankar 
34798f690400SShri Abhyankar #undef __FUNCT__
348006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
348106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
348215091d37SBarry Smith {
348315091d37SBarry Smith   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3484b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3485b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3486dfbe8321SBarry Smith   PetscErrorCode    ierr;
3487d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3488d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3489d9fead3dSBarry Smith   const PetscScalar *b;
349015091d37SBarry Smith 
349115091d37SBarry Smith   PetscFunctionBegin;
34923649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
34931ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
349415091d37SBarry Smith   /* forward solve the lower triangular */
349515091d37SBarry Smith   idx  = 0;
349615091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
349715091d37SBarry Smith   for (i=1; i<n; i++) {
349815091d37SBarry Smith     v   =  aa + 25*ai[i];
349915091d37SBarry Smith     vi  =  aj + ai[i];
350015091d37SBarry Smith     nz  =  diag[i] - ai[i];
350115091d37SBarry Smith     idx =  5*i;
3502f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
350315091d37SBarry Smith     while (nz--) {
350415091d37SBarry Smith       jdx = 5*(*vi++);
350515091d37SBarry Smith       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3506f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3507f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3508f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3509f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3510f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
351115091d37SBarry Smith       v  += 25;
351215091d37SBarry Smith     }
3513f1af5d2fSBarry Smith     x[idx]   = s1;
3514f1af5d2fSBarry Smith     x[1+idx] = s2;
3515f1af5d2fSBarry Smith     x[2+idx] = s3;
3516f1af5d2fSBarry Smith     x[3+idx] = s4;
3517f1af5d2fSBarry Smith     x[4+idx] = s5;
351815091d37SBarry Smith   }
351915091d37SBarry Smith   /* backward solve the upper triangular */
352015091d37SBarry Smith   for (i=n-1; i>=0; i--) {
352115091d37SBarry Smith     v   = aa + 25*diag[i] + 25;
352215091d37SBarry Smith     vi  = aj + diag[i] + 1;
352315091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
352415091d37SBarry Smith     idt = 5*i;
3525f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt];
3526f1af5d2fSBarry Smith     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
352715091d37SBarry Smith     while (nz--) {
352815091d37SBarry Smith       idx = 5*(*vi++);
352915091d37SBarry Smith       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3530f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3531f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3532f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3533f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3534f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
353515091d37SBarry Smith       v  += 25;
353615091d37SBarry Smith     }
353715091d37SBarry Smith     v        = aa + 25*diag[i];
3538f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3539f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3540f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3541f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3542f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
354315091d37SBarry Smith   }
354415091d37SBarry Smith 
35453649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
35461ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3547dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
354815091d37SBarry Smith   PetscFunctionReturn(0);
354915091d37SBarry Smith }
355015091d37SBarry Smith 
3551cee9d6f2SShri Abhyankar #undef __FUNCT__
35524dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
35534dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
355453cca76cSShri Abhyankar {
355553cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3556b3260449SShri Abhyankar   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3557b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
355853cca76cSShri Abhyankar   PetscErrorCode    ierr;
355953cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
356053cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
356153cca76cSShri Abhyankar   const PetscScalar *b;
356253cca76cSShri Abhyankar 
356353cca76cSShri Abhyankar   PetscFunctionBegin;
35643649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
356553cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356653cca76cSShri Abhyankar   /* forward solve the lower triangular */
356753cca76cSShri Abhyankar   idx  = 0;
356853cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
356953cca76cSShri Abhyankar   for (i=1; i<n; i++) {
357053cca76cSShri Abhyankar     v   = aa + 25*ai[i];
357153cca76cSShri Abhyankar     vi  = aj + ai[i];
357253cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
357353cca76cSShri Abhyankar     idx = 5*i;
357453cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
357553cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
357653cca76cSShri Abhyankar       jdx = 5*vi[k];
357753cca76cSShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
357853cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
357953cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
358053cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
358153cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
358253cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
358353cca76cSShri Abhyankar       v  += 25;
358453cca76cSShri Abhyankar     }
358553cca76cSShri Abhyankar     x[idx]   = s1;
358653cca76cSShri Abhyankar     x[1+idx] = s2;
358753cca76cSShri Abhyankar     x[2+idx] = s3;
358853cca76cSShri Abhyankar     x[3+idx] = s4;
358953cca76cSShri Abhyankar     x[4+idx] = s5;
359053cca76cSShri Abhyankar   }
359153cca76cSShri Abhyankar 
359253cca76cSShri Abhyankar   /* backward solve the upper triangular */
359353cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
359453cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
359553cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
359653cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
359753cca76cSShri Abhyankar     idt = 5*i;
359853cca76cSShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];
359953cca76cSShri Abhyankar     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360053cca76cSShri Abhyankar     for (k=0; k<nz; k++) {
360153cca76cSShri Abhyankar       idx = 5*vi[k];
360253cca76cSShri Abhyankar       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
360353cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
360453cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
360553cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
360653cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
360753cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
360853cca76cSShri Abhyankar       v  += 25;
360953cca76cSShri Abhyankar     }
361053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
361153cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
361253cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
361353cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
361453cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
361553cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
361653cca76cSShri Abhyankar   }
361753cca76cSShri Abhyankar 
36183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
361953cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
362053cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
362153cca76cSShri Abhyankar   PetscFunctionReturn(0);
362253cca76cSShri Abhyankar }
362353cca76cSShri Abhyankar 
362453cca76cSShri Abhyankar #undef __FUNCT__
362506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
362606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
36274e2b4712SSatish Balay {
36284e2b4712SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
36294e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
36306849ba73SBarry Smith   PetscErrorCode    ierr;
3631b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3632b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
36335d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3634d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3635d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3636d9fead3dSBarry Smith   const PetscScalar *b;
36374e2b4712SSatish Balay 
36384e2b4712SSatish Balay   PetscFunctionBegin;
36393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
36401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3641f1af5d2fSBarry Smith   t    = a->solve_work;
36424e2b4712SSatish Balay 
36434e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
36444e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
36454e2b4712SSatish Balay 
36464e2b4712SSatish Balay   /* forward solve the lower triangular */
36474e2b4712SSatish Balay   idx  = 4*(*r++);
3648f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3649f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
36504e2b4712SSatish Balay   for (i=1; i<n; i++) {
36514e2b4712SSatish Balay     v   = aa + 16*ai[i];
36524e2b4712SSatish Balay     vi  = aj + ai[i];
36534e2b4712SSatish Balay     nz  = diag[i] - ai[i];
36544e2b4712SSatish Balay     idx = 4*(*r++);
3655f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
36564e2b4712SSatish Balay     while (nz--) {
36574e2b4712SSatish Balay       idx = 4*(*vi++);
3658f1af5d2fSBarry Smith       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3659f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3660f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3661f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3662f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
36634e2b4712SSatish Balay       v  += 16;
36644e2b4712SSatish Balay     }
36654e2b4712SSatish Balay     idx      = 4*i;
3666f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3667f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
36684e2b4712SSatish Balay   }
36694e2b4712SSatish Balay   /* backward solve the upper triangular */
36704e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
36714e2b4712SSatish Balay     v   = aa + 16*diag[i] + 16;
36724e2b4712SSatish Balay     vi  = aj + diag[i] + 1;
36734e2b4712SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
36744e2b4712SSatish Balay     idt = 4*i;
3675f1af5d2fSBarry Smith     s1  = t[idt];  s2 = t[1+idt];
3676f1af5d2fSBarry Smith     s3  = t[2+idt];s4 = t[3+idt];
36774e2b4712SSatish Balay     while (nz--) {
36784e2b4712SSatish Balay       idx = 4*(*vi++);
3679f1af5d2fSBarry Smith       x1  = t[idx];   x2 = t[1+idx];
3680f1af5d2fSBarry Smith       x3  = t[2+idx]; x4 = t[3+idx];
3681f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3682f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3683f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3684f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
36854e2b4712SSatish Balay       v  += 16;
36864e2b4712SSatish Balay     }
36874e2b4712SSatish Balay     idc      = 4*(*c--);
36884e2b4712SSatish Balay     v        = aa + 16*diag[i];
3689f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3690f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3691f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3692f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
36934e2b4712SSatish Balay   }
36944e2b4712SSatish Balay 
36954e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
36964e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
36973649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
36981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3699dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37004e2b4712SSatish Balay   PetscFunctionReturn(0);
37014e2b4712SSatish Balay }
3702f26ec98cSKris Buschelman 
37038f690400SShri Abhyankar #undef __FUNCT__
37044dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
37054dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
370678bb4007SShri Abhyankar {
370778bb4007SShri Abhyankar   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
370878bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
370978bb4007SShri Abhyankar   PetscErrorCode    ierr;
3710b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3711b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
371278bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
371378bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
371478bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
371578bb4007SShri Abhyankar   const PetscScalar *b;
371678bb4007SShri Abhyankar 
371778bb4007SShri Abhyankar   PetscFunctionBegin;
37183649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
371978bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
372078bb4007SShri Abhyankar   t    = a->solve_work;
372178bb4007SShri Abhyankar 
372278bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
372378bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
372478bb4007SShri Abhyankar 
372578bb4007SShri Abhyankar   /* forward solve the lower triangular */
372678bb4007SShri Abhyankar   idx  = 4*r[0];
372778bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
372878bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
372978bb4007SShri Abhyankar   for (i=1; i<n; i++) {
373078bb4007SShri Abhyankar     v   = aa + 16*ai[i];
373178bb4007SShri Abhyankar     vi  = aj + ai[i];
373278bb4007SShri Abhyankar     nz  = ai[i+1] - ai[i];
373378bb4007SShri Abhyankar     idx = 4*r[i];
373478bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
373578bb4007SShri Abhyankar     for (m=0; m<nz; m++) {
373678bb4007SShri Abhyankar       idx = 4*vi[m];
373778bb4007SShri Abhyankar       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
373878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
373978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
374078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
374178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
374278bb4007SShri Abhyankar       v  += 16;
374378bb4007SShri Abhyankar     }
374478bb4007SShri Abhyankar     idx      = 4*i;
374578bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
374678bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
374778bb4007SShri Abhyankar   }
374878bb4007SShri Abhyankar   /* backward solve the upper triangular */
374978bb4007SShri Abhyankar   for (i=n-1; i>=0; i--) {
375078bb4007SShri Abhyankar     v   = aa + 16*(adiag[i+1]+1);
375178bb4007SShri Abhyankar     vi  = aj + adiag[i+1]+1;
375278bb4007SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
375378bb4007SShri Abhyankar     idt = 4*i;
375478bb4007SShri Abhyankar     s1  = t[idt];  s2 = t[1+idt];
375578bb4007SShri Abhyankar     s3  = t[2+idt];s4 = t[3+idt];
375678bb4007SShri Abhyankar     for (m=0; m<nz; m++) {
375778bb4007SShri Abhyankar       idx = 4*vi[m];
375878bb4007SShri Abhyankar       x1  = t[idx];   x2 = t[1+idx];
375978bb4007SShri Abhyankar       x3  = t[2+idx]; x4 = t[3+idx];
376078bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
376178bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
376278bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
376378bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
376478bb4007SShri Abhyankar       v  += 16;
376578bb4007SShri Abhyankar     }
376678bb4007SShri Abhyankar     idc      = 4*c[i];
376778bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
376878bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
376978bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
377078bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
377178bb4007SShri Abhyankar   }
377278bb4007SShri Abhyankar 
377378bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
377478bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37753649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
377678bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
377778bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
377878bb4007SShri Abhyankar   PetscFunctionReturn(0);
377978bb4007SShri Abhyankar }
378078bb4007SShri Abhyankar 
378178bb4007SShri Abhyankar #undef __FUNCT__
3782f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3783dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3784f26ec98cSKris Buschelman {
3785f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3786f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
37876849ba73SBarry Smith   PetscErrorCode    ierr;
3788b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3789b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
37905d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3791d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3792d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3793d9fead3dSBarry Smith   PetscScalar       *x;
3794d9fead3dSBarry Smith   const PetscScalar *b;
3795f26ec98cSKris Buschelman 
3796f26ec98cSKris Buschelman   PetscFunctionBegin;
37973649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
37981ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3799f26ec98cSKris Buschelman   t    = (MatScalar*)a->solve_work;
3800f26ec98cSKris Buschelman 
3801f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3802f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3803f26ec98cSKris Buschelman 
3804f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3805f26ec98cSKris Buschelman   idx  = 4*(*r++);
3806f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3807f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3808f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3809f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3810f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3811f26ec98cSKris Buschelman     v   = aa + 16*ai[i];
3812f26ec98cSKris Buschelman     vi  = aj + ai[i];
3813f26ec98cSKris Buschelman     nz  = diag[i] - ai[i];
3814f26ec98cSKris Buschelman     idx = 4*(*r++);
3815f26ec98cSKris Buschelman     s1  = (MatScalar)b[idx];
3816f26ec98cSKris Buschelman     s2  = (MatScalar)b[1+idx];
3817f26ec98cSKris Buschelman     s3  = (MatScalar)b[2+idx];
3818f26ec98cSKris Buschelman     s4  = (MatScalar)b[3+idx];
3819f26ec98cSKris Buschelman     while (nz--) {
3820f26ec98cSKris Buschelman       idx = 4*(*vi++);
3821f26ec98cSKris Buschelman       x1  = t[idx];
3822f26ec98cSKris Buschelman       x2  = t[1+idx];
3823f26ec98cSKris Buschelman       x3  = t[2+idx];
3824f26ec98cSKris Buschelman       x4  = t[3+idx];
3825f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3826f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3827f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3828f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3829f26ec98cSKris Buschelman       v  += 16;
3830f26ec98cSKris Buschelman     }
3831f26ec98cSKris Buschelman     idx      = 4*i;
3832f26ec98cSKris Buschelman     t[idx]   = s1;
3833f26ec98cSKris Buschelman     t[1+idx] = s2;
3834f26ec98cSKris Buschelman     t[2+idx] = s3;
3835f26ec98cSKris Buschelman     t[3+idx] = s4;
3836f26ec98cSKris Buschelman   }
3837f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3838f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--) {
3839f26ec98cSKris Buschelman     v   = aa + 16*diag[i] + 16;
3840f26ec98cSKris Buschelman     vi  = aj + diag[i] + 1;
3841f26ec98cSKris Buschelman     nz  = ai[i+1] - diag[i] - 1;
3842f26ec98cSKris Buschelman     idt = 4*i;
3843f26ec98cSKris Buschelman     s1  = t[idt];
3844f26ec98cSKris Buschelman     s2  = t[1+idt];
3845f26ec98cSKris Buschelman     s3  = t[2+idt];
3846f26ec98cSKris Buschelman     s4  = t[3+idt];
3847f26ec98cSKris Buschelman     while (nz--) {
3848f26ec98cSKris Buschelman       idx = 4*(*vi++);
3849f26ec98cSKris Buschelman       x1  = t[idx];
3850f26ec98cSKris Buschelman       x2  = t[1+idx];
3851f26ec98cSKris Buschelman       x3  = t[2+idx];
3852f26ec98cSKris Buschelman       x4  = t[3+idx];
3853f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3854f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3855f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3856f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3857f26ec98cSKris Buschelman       v  += 16;
3858f26ec98cSKris Buschelman     }
3859f26ec98cSKris Buschelman     idc      = 4*(*c--);
3860f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3861f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3862f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3863f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3864f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3865f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3866f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3867f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3868f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3869f26ec98cSKris Buschelman   }
3870f26ec98cSKris Buschelman 
3871f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3872f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38733649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
38741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3875dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3876f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3877f26ec98cSKris Buschelman }
3878f26ec98cSKris Buschelman 
387924c233c2SKris Buschelman #if defined(PETSC_HAVE_SSE)
388024c233c2SKris Buschelman 
388124c233c2SKris Buschelman #include PETSC_HAVE_SSE
388224c233c2SKris Buschelman 
388324c233c2SKris Buschelman #undef __FUNCT__
388424c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3885dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
388624c233c2SKris Buschelman {
388724c233c2SKris Buschelman   /*
388824c233c2SKris Buschelman      Note: This code uses demotion of double
388924c233c2SKris Buschelman      to float when performing the mixed-mode computation.
389024c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
389124c233c2SKris Buschelman   */
389224c233c2SKris Buschelman   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
389324c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
38946849ba73SBarry Smith   PetscErrorCode ierr;
38955d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
38965d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
389724c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
389887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
389924c233c2SKris Buschelman 
390024c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
390124c233c2SKris Buschelman   float         ssealignedspace[11],*tmps,*tmpx;
390224c233c2SKris Buschelman   unsigned long offset;
390324c233c2SKris Buschelman 
390424c233c2SKris Buschelman   PetscFunctionBegin;
390524c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
390624c233c2SKris Buschelman 
390724c233c2SKris Buschelman   offset = (unsigned long)ssealignedspace % 16;
390824c233c2SKris Buschelman   if (offset) offset = (16 - offset)/4;
390924c233c2SKris Buschelman   tmps = &ssealignedspace[offset];
391024c233c2SKris Buschelman   tmpx = &ssealignedspace[offset+4];
391124c233c2SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
391224c233c2SKris Buschelman 
39131ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39141ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
391524c233c2SKris Buschelman   t    = a->solve_work;
391624c233c2SKris Buschelman 
391724c233c2SKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
391824c233c2SKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
391924c233c2SKris Buschelman 
392024c233c2SKris Buschelman   /* forward solve the lower triangular */
392124c233c2SKris Buschelman   idx  = 4*(*r++);
392224c233c2SKris Buschelman   t[0] = b[idx];   t[1] = b[1+idx];
392324c233c2SKris Buschelman   t[2] = b[2+idx]; t[3] = b[3+idx];
392424c233c2SKris Buschelman   v    =  aa + 16*ai[1];
392524c233c2SKris Buschelman 
392624c233c2SKris Buschelman   for (i=1; i<n; ) {
392724c233c2SKris Buschelman     PREFETCH_NTA(&v[8]);
392824c233c2SKris Buschelman     vi  =  aj      + ai[i];
392924c233c2SKris Buschelman     nz  =  diag[i] - ai[i];
393024c233c2SKris Buschelman     idx =  4*(*r++);
393124c233c2SKris Buschelman 
393224c233c2SKris Buschelman     /* Demote sum from double to float */
393324c233c2SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
393424c233c2SKris Buschelman     LOAD_PS(tmps,XMM7);
393524c233c2SKris Buschelman 
393624c233c2SKris Buschelman     while (nz--) {
393724c233c2SKris Buschelman       PREFETCH_NTA(&v[16]);
393824c233c2SKris Buschelman       idx = 4*(*vi++);
393924c233c2SKris Buschelman 
394024c233c2SKris Buschelman       /* Demote solution (so far) from double to float */
394124c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
394224c233c2SKris Buschelman 
394324c233c2SKris Buschelman       /* 4x4 Matrix-Vector product with negative accumulation: */
394424c233c2SKris Buschelman       SSE_INLINE_BEGIN_2(tmpx,v)
394524c233c2SKris Buschelman       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
394624c233c2SKris Buschelman 
394724c233c2SKris Buschelman       /* First Column */
394824c233c2SKris Buschelman       SSE_COPY_PS(XMM0,XMM6)
394924c233c2SKris Buschelman       SSE_SHUFFLE(XMM0,XMM0,0x00)
395024c233c2SKris Buschelman       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
395124c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM0)
395224c233c2SKris Buschelman 
395324c233c2SKris Buschelman       /* Second Column */
395424c233c2SKris Buschelman       SSE_COPY_PS(XMM1,XMM6)
395524c233c2SKris Buschelman       SSE_SHUFFLE(XMM1,XMM1,0x55)
395624c233c2SKris Buschelman       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
395724c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM1)
395824c233c2SKris Buschelman 
395924c233c2SKris Buschelman       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
396024c233c2SKris Buschelman 
396124c233c2SKris Buschelman       /* Third Column */
396224c233c2SKris Buschelman       SSE_COPY_PS(XMM2,XMM6)
396324c233c2SKris Buschelman       SSE_SHUFFLE(XMM2,XMM2,0xAA)
396424c233c2SKris Buschelman       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
396524c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM2)
396624c233c2SKris Buschelman 
396724c233c2SKris Buschelman       /* Fourth Column */
396824c233c2SKris Buschelman       SSE_COPY_PS(XMM3,XMM6)
396924c233c2SKris Buschelman       SSE_SHUFFLE(XMM3,XMM3,0xFF)
397024c233c2SKris Buschelman       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
397124c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM3)
397224c233c2SKris Buschelman       SSE_INLINE_END_2
397324c233c2SKris Buschelman 
397424c233c2SKris Buschelman       v += 16;
397524c233c2SKris Buschelman     }
397624c233c2SKris Buschelman     idx = 4*i;
397724c233c2SKris Buschelman     v   = aa + 16*ai[++i];
397824c233c2SKris Buschelman     PREFETCH_NTA(v);
397924c233c2SKris Buschelman     STORE_PS(tmps,XMM7);
398024c233c2SKris Buschelman 
398124c233c2SKris Buschelman     /* Promote result from float to double */
398224c233c2SKris Buschelman     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
398324c233c2SKris Buschelman   }
398424c233c2SKris Buschelman   /* backward solve the upper triangular */
398524c233c2SKris Buschelman   idt  = 4*(n-1);
398624c233c2SKris Buschelman   ai16 = 16*diag[n-1];
398724c233c2SKris Buschelman   v    = aa + ai16 + 16;
398824c233c2SKris Buschelman   for (i=n-1; i>=0; ) {
398924c233c2SKris Buschelman     PREFETCH_NTA(&v[8]);
399024c233c2SKris Buschelman     vi = aj + diag[i] + 1;
399124c233c2SKris Buschelman     nz = ai[i+1] - diag[i] - 1;
399224c233c2SKris Buschelman 
399324c233c2SKris Buschelman     /* Demote accumulator from double to float */
399424c233c2SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
399524c233c2SKris Buschelman     LOAD_PS(tmps,XMM7);
399624c233c2SKris Buschelman 
399724c233c2SKris Buschelman     while (nz--) {
399824c233c2SKris Buschelman       PREFETCH_NTA(&v[16]);
399924c233c2SKris Buschelman       idx = 4*(*vi++);
400024c233c2SKris Buschelman 
400124c233c2SKris Buschelman       /* Demote solution (so far) from double to float */
400224c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
400324c233c2SKris Buschelman 
400424c233c2SKris Buschelman       /* 4x4 Matrix-Vector Product with negative accumulation: */
400524c233c2SKris Buschelman       SSE_INLINE_BEGIN_2(tmpx,v)
400624c233c2SKris Buschelman       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
400724c233c2SKris Buschelman 
400824c233c2SKris Buschelman       /* First Column */
400924c233c2SKris Buschelman       SSE_COPY_PS(XMM0,XMM6)
401024c233c2SKris Buschelman       SSE_SHUFFLE(XMM0,XMM0,0x00)
401124c233c2SKris Buschelman       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
401224c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM0)
401324c233c2SKris Buschelman 
401424c233c2SKris Buschelman       /* Second Column */
401524c233c2SKris Buschelman       SSE_COPY_PS(XMM1,XMM6)
401624c233c2SKris Buschelman       SSE_SHUFFLE(XMM1,XMM1,0x55)
401724c233c2SKris Buschelman       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
401824c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM1)
401924c233c2SKris Buschelman 
402024c233c2SKris Buschelman       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
402124c233c2SKris Buschelman 
402224c233c2SKris Buschelman       /* Third Column */
402324c233c2SKris Buschelman       SSE_COPY_PS(XMM2,XMM6)
402424c233c2SKris Buschelman       SSE_SHUFFLE(XMM2,XMM2,0xAA)
402524c233c2SKris Buschelman       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
402624c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM2)
402724c233c2SKris Buschelman 
402824c233c2SKris Buschelman       /* Fourth Column */
402924c233c2SKris Buschelman       SSE_COPY_PS(XMM3,XMM6)
403024c233c2SKris Buschelman       SSE_SHUFFLE(XMM3,XMM3,0xFF)
403124c233c2SKris Buschelman       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
403224c233c2SKris Buschelman       SSE_SUB_PS(XMM7,XMM3)
403324c233c2SKris Buschelman       SSE_INLINE_END_2
403424c233c2SKris Buschelman       v += 16;
403524c233c2SKris Buschelman     }
403624c233c2SKris Buschelman     v    = aa + ai16;
403724c233c2SKris Buschelman     ai16 = 16*diag[--i];
403824c233c2SKris Buschelman     PREFETCH_NTA(aa+ai16+16);
403924c233c2SKris Buschelman     /*
404024c233c2SKris Buschelman        Scale the result by the diagonal 4x4 block,
404124c233c2SKris Buschelman        which was inverted as part of the factorization
404224c233c2SKris Buschelman     */
404324c233c2SKris Buschelman     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
404424c233c2SKris Buschelman     /* First Column */
404524c233c2SKris Buschelman     SSE_COPY_PS(XMM0,XMM7)
404624c233c2SKris Buschelman     SSE_SHUFFLE(XMM0,XMM0,0x00)
404724c233c2SKris Buschelman     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
404824c233c2SKris Buschelman 
404924c233c2SKris Buschelman     /* Second Column */
405024c233c2SKris Buschelman     SSE_COPY_PS(XMM1,XMM7)
405124c233c2SKris Buschelman     SSE_SHUFFLE(XMM1,XMM1,0x55)
405224c233c2SKris Buschelman     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
405324c233c2SKris Buschelman     SSE_ADD_PS(XMM0,XMM1)
405424c233c2SKris Buschelman 
405524c233c2SKris Buschelman     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
405624c233c2SKris Buschelman 
405724c233c2SKris Buschelman     /* Third Column */
405824c233c2SKris Buschelman     SSE_COPY_PS(XMM2,XMM7)
405924c233c2SKris Buschelman     SSE_SHUFFLE(XMM2,XMM2,0xAA)
406024c233c2SKris Buschelman     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
406124c233c2SKris Buschelman     SSE_ADD_PS(XMM0,XMM2)
406224c233c2SKris Buschelman 
406324c233c2SKris Buschelman     /* Fourth Column */
406424c233c2SKris Buschelman     SSE_COPY_PS(XMM3,XMM7)
406524c233c2SKris Buschelman     SSE_SHUFFLE(XMM3,XMM3,0xFF)
406624c233c2SKris Buschelman     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
406724c233c2SKris Buschelman     SSE_ADD_PS(XMM0,XMM3)
406824c233c2SKris Buschelman 
406924c233c2SKris Buschelman     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
407024c233c2SKris Buschelman     SSE_INLINE_END_3
407124c233c2SKris Buschelman 
407224c233c2SKris Buschelman     /* Promote solution from float to double */
407324c233c2SKris Buschelman     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
407424c233c2SKris Buschelman 
407524c233c2SKris Buschelman     /* Apply reordering to t and stream into x.    */
407624c233c2SKris Buschelman     /* This way, x doesn't pollute the cache.      */
407724c233c2SKris Buschelman     /* Be careful with size: 2 doubles = 4 floats! */
407824c233c2SKris Buschelman     idc = 4*(*c--);
407924c233c2SKris Buschelman     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
408024c233c2SKris Buschelman     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
408124c233c2SKris Buschelman     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
408224c233c2SKris Buschelman     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
408324c233c2SKris Buschelman     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
408424c233c2SKris Buschelman     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
408524c233c2SKris Buschelman     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
408624c233c2SKris Buschelman     SSE_INLINE_END_2
408724c233c2SKris Buschelman     v    = aa + ai16 + 16;
408824c233c2SKris Buschelman     idt -= 4;
408924c233c2SKris Buschelman   }
409024c233c2SKris Buschelman 
409124c233c2SKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
409224c233c2SKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40931ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4095dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
409624c233c2SKris Buschelman   SSE_SCOPE_END;
409724c233c2SKris Buschelman   PetscFunctionReturn(0);
409824c233c2SKris Buschelman }
409924c233c2SKris Buschelman 
410024c233c2SKris Buschelman #endif
41010ef38995SBarry Smith 
41020ef38995SBarry Smith 
41034e2b4712SSatish Balay /*
41044e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
41054e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
41064e2b4712SSatish Balay */
41074a2ae208SSatish Balay #undef __FUNCT__
410806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
410906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
41104e2b4712SSatish Balay {
41114e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4112356650c2SBarry Smith   PetscInt          n  =a->mbs;
4113356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4114dfbe8321SBarry Smith   PetscErrorCode    ierr;
4115356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4116d9fead3dSBarry Smith   const MatScalar   *aa   =a->a;
4117d9fead3dSBarry Smith   PetscScalar       *x;
4118d9fead3dSBarry Smith   const PetscScalar *b;
41194e2b4712SSatish Balay 
41204e2b4712SSatish Balay   PetscFunctionBegin;
41213649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
41221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41234e2b4712SSatish Balay 
4124aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
41252853dc0eSBarry Smith   {
412687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41272853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
41282853dc0eSBarry Smith   }
4129aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
41302853dc0eSBarry Smith   {
413187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41322853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
41332853dc0eSBarry Smith   }
4134aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
41352853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4136e1293385SBarry Smith #else
413730d4dcafSBarry Smith   {
413887828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4139d9fead3dSBarry Smith     const MatScalar *v;
4140356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4141356650c2SBarry Smith     const PetscInt  *vi;
4142e1293385SBarry Smith 
41434e2b4712SSatish Balay     /* forward solve the lower triangular */
41444e2b4712SSatish Balay     idx  = 0;
4145e1293385SBarry Smith     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
41464e2b4712SSatish Balay     for (i=1; i<n; i++) {
41474e2b4712SSatish Balay       v    =  aa      + 16*ai[i];
41484e2b4712SSatish Balay       vi   =  aj      + ai[i];
41494e2b4712SSatish Balay       nz   =  diag[i] - ai[i];
4150e1293385SBarry Smith       idx +=  4;
4151f1af5d2fSBarry Smith       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
41524e2b4712SSatish Balay       while (nz--) {
41534e2b4712SSatish Balay         jdx = 4*(*vi++);
41544e2b4712SSatish Balay         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4155f1af5d2fSBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4156f1af5d2fSBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4157f1af5d2fSBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4158f1af5d2fSBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
41594e2b4712SSatish Balay         v  += 16;
41604e2b4712SSatish Balay       }
4161f1af5d2fSBarry Smith       x[idx]   = s1;
4162f1af5d2fSBarry Smith       x[1+idx] = s2;
4163f1af5d2fSBarry Smith       x[2+idx] = s3;
4164f1af5d2fSBarry Smith       x[3+idx] = s4;
41654e2b4712SSatish Balay     }
41664e2b4712SSatish Balay     /* backward solve the upper triangular */
41674e555682SBarry Smith     idt = 4*(n-1);
41684e2b4712SSatish Balay     for (i=n-1; i>=0; i--) {
41694e555682SBarry Smith       ai16 = 16*diag[i];
41704e555682SBarry Smith       v    = aa + ai16 + 16;
41714e2b4712SSatish Balay       vi   = aj + diag[i] + 1;
41724e2b4712SSatish Balay       nz   = ai[i+1] - diag[i] - 1;
4173f1af5d2fSBarry Smith       s1   = x[idt];  s2 = x[1+idt];
4174f1af5d2fSBarry Smith       s3   = x[2+idt];s4 = x[3+idt];
41754e2b4712SSatish Balay       while (nz--) {
41764e2b4712SSatish Balay         idx = 4*(*vi++);
41774e2b4712SSatish Balay         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4178f1af5d2fSBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4179f1af5d2fSBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4180f1af5d2fSBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4181f1af5d2fSBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
41824e2b4712SSatish Balay         v  += 16;
41834e2b4712SSatish Balay       }
41844e555682SBarry Smith       v        = aa + ai16;
4185f1af5d2fSBarry Smith       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4186f1af5d2fSBarry Smith       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4187f1af5d2fSBarry Smith       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4188f1af5d2fSBarry Smith       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4189329f5518SBarry Smith       idt     -= 4;
41904e2b4712SSatish Balay     }
419130d4dcafSBarry Smith   }
4192e1293385SBarry Smith #endif
41934e2b4712SSatish Balay 
41943649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
41951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
41974e2b4712SSatish Balay   PetscFunctionReturn(0);
41984e2b4712SSatish Balay }
41994e2b4712SSatish Balay 
4200b2b2dd24SShri Abhyankar #undef __FUNCT__
42014dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
42024dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4203b2b2dd24SShri Abhyankar {
4204b2b2dd24SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4205b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4206b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,jdx,idt;
4207b2b2dd24SShri Abhyankar   PetscErrorCode    ierr;
4208b3260449SShri Abhyankar   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4209b2b2dd24SShri Abhyankar   const MatScalar   *aa=a->a,*v;
4210b2b2dd24SShri Abhyankar   PetscScalar       *x;
4211b2b2dd24SShri Abhyankar   const PetscScalar *b;
4212b2b2dd24SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4213cee9d6f2SShri Abhyankar 
4214b2b2dd24SShri Abhyankar   PetscFunctionBegin;
42153649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4216b2b2dd24SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4217b2b2dd24SShri Abhyankar   /* forward solve the lower triangular */
4218b2b2dd24SShri Abhyankar   idx  = 0;
4219b2b2dd24SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4220b2b2dd24SShri Abhyankar   for (i=1; i<n; i++) {
4221b2b2dd24SShri Abhyankar     v   = aa + bs2*ai[i];
4222b2b2dd24SShri Abhyankar     vi  = aj + ai[i];
4223b2b2dd24SShri Abhyankar     nz  = ai[i+1] - ai[i];
4224b2b2dd24SShri Abhyankar     idx = bs*i;
4225b2b2dd24SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4226b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
4227b2b2dd24SShri Abhyankar       jdx = bs*vi[k];
4228b2b2dd24SShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4229b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4230b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4231b2b2dd24SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4232b2b2dd24SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4233b2b2dd24SShri Abhyankar 
4234b2b2dd24SShri Abhyankar       v +=  bs2;
4235b2b2dd24SShri Abhyankar     }
4236b2b2dd24SShri Abhyankar 
4237b2b2dd24SShri Abhyankar     x[idx]   = s1;
4238b2b2dd24SShri Abhyankar     x[1+idx] = s2;
4239b2b2dd24SShri Abhyankar     x[2+idx] = s3;
4240b2b2dd24SShri Abhyankar     x[3+idx] = s4;
4241b2b2dd24SShri Abhyankar   }
4242b2b2dd24SShri Abhyankar 
4243b2b2dd24SShri Abhyankar   /* backward solve the upper triangular */
4244b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
4245b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4246b2b2dd24SShri Abhyankar     vi  = aj + adiag[i+1]+1;
4247b2b2dd24SShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
4248b2b2dd24SShri Abhyankar     idt = bs*i;
4249b2b2dd24SShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4250b2b2dd24SShri Abhyankar 
4251b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
4252b2b2dd24SShri Abhyankar       idx = bs*vi[k];
4253b2b2dd24SShri Abhyankar       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4254b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4255b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4256b2b2dd24SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4257b2b2dd24SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4258b2b2dd24SShri Abhyankar 
4259b2b2dd24SShri Abhyankar       v +=  bs2;
4260b2b2dd24SShri Abhyankar     }
4261b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4262b2b2dd24SShri Abhyankar     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4263b2b2dd24SShri Abhyankar     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4264b2b2dd24SShri Abhyankar     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4265b2b2dd24SShri Abhyankar     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4266b2b2dd24SShri Abhyankar 
4267b2b2dd24SShri Abhyankar   }
4268b2b2dd24SShri Abhyankar 
42693649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4270b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4271b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4272b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4273b2b2dd24SShri Abhyankar }
4274cee9d6f2SShri Abhyankar 
4275cee9d6f2SShri Abhyankar #undef __FUNCT__
4276f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4278f26ec98cSKris Buschelman {
4279f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4280b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4281dfbe8321SBarry Smith   PetscErrorCode    ierr;
4282b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4283b3260449SShri Abhyankar   const PetscScalar *b;
4284b3260449SShri Abhyankar   PetscScalar       *x;
4285f26ec98cSKris Buschelman 
4286f26ec98cSKris Buschelman   PetscFunctionBegin;
42873649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
42881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4289f26ec98cSKris Buschelman 
4290f26ec98cSKris Buschelman   {
4291f26ec98cSKris Buschelman     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4292b3260449SShri Abhyankar     const MatScalar *v;
4293b3260449SShri Abhyankar     MatScalar       *t=(MatScalar*)x;
4294b3260449SShri Abhyankar     PetscInt        jdx,idt,idx,nz,i,ai16;
4295b3260449SShri Abhyankar     const PetscInt  *vi;
4296f26ec98cSKris Buschelman 
4297f26ec98cSKris Buschelman     /* forward solve the lower triangular */
4298f26ec98cSKris Buschelman     idx  = 0;
4299f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4300f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4301f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4302f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4303f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4304f26ec98cSKris Buschelman       v    =  aa      + 16*ai[i];
4305f26ec98cSKris Buschelman       vi   =  aj      + ai[i];
4306f26ec98cSKris Buschelman       nz   =  diag[i] - ai[i];
4307f26ec98cSKris Buschelman       idx +=  4;
4308f26ec98cSKris Buschelman       s1   = (MatScalar)b[idx];
4309f26ec98cSKris Buschelman       s2   = (MatScalar)b[1+idx];
4310f26ec98cSKris Buschelman       s3   = (MatScalar)b[2+idx];
4311f26ec98cSKris Buschelman       s4   = (MatScalar)b[3+idx];
4312f26ec98cSKris Buschelman       while (nz--) {
4313f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4314f26ec98cSKris Buschelman         x1  = t[jdx];
4315f26ec98cSKris Buschelman         x2  = t[1+jdx];
4316f26ec98cSKris Buschelman         x3  = t[2+jdx];
4317f26ec98cSKris Buschelman         x4  = t[3+jdx];
4318f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4319f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4320f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4321f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4322f26ec98cSKris Buschelman         v  += 16;
4323f26ec98cSKris Buschelman       }
4324f26ec98cSKris Buschelman       t[idx]   = s1;
4325f26ec98cSKris Buschelman       t[1+idx] = s2;
4326f26ec98cSKris Buschelman       t[2+idx] = s3;
4327f26ec98cSKris Buschelman       t[3+idx] = s4;
4328f26ec98cSKris Buschelman     }
4329f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4330f26ec98cSKris Buschelman     idt = 4*(n-1);
4331f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--) {
4332f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4333f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4334f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4335f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4336f26ec98cSKris Buschelman       s1   = t[idt];
4337f26ec98cSKris Buschelman       s2   = t[1+idt];
4338f26ec98cSKris Buschelman       s3   = t[2+idt];
4339f26ec98cSKris Buschelman       s4   = t[3+idt];
4340f26ec98cSKris Buschelman       while (nz--) {
4341f26ec98cSKris Buschelman         idx = 4*(*vi++);
4342f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4343f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4344f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4345f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4346f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4347f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4348f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4349f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4350f26ec98cSKris Buschelman         v  += 16;
4351f26ec98cSKris Buschelman       }
4352f26ec98cSKris Buschelman       v        = aa + ai16;
4353f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4354f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4355f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4356f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4357f26ec98cSKris Buschelman       idt     -= 4;
4358f26ec98cSKris Buschelman     }
4359f26ec98cSKris Buschelman   }
4360f26ec98cSKris Buschelman 
43613649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
43621ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4363dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4364f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4365f26ec98cSKris Buschelman }
4366f26ec98cSKris Buschelman 
43673660e330SKris Buschelman #if defined(PETSC_HAVE_SSE)
43683660e330SKris Buschelman 
43693660e330SKris Buschelman #include PETSC_HAVE_SSE
43703660e330SKris Buschelman #undef __FUNCT__
43717cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4372dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
43733660e330SKris Buschelman {
43743660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
43752aa5897fSKris Buschelman   unsigned short *aj=(unsigned short*)a->j;
4376dfbe8321SBarry Smith   PetscErrorCode ierr;
4377dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
43783660e330SKris Buschelman   MatScalar      *aa=a->a;
437987828ca2SBarry Smith   PetscScalar    *x,*b;
43803660e330SKris Buschelman 
43813660e330SKris Buschelman   PetscFunctionBegin;
43823660e330SKris Buschelman   SSE_SCOPE_BEGIN;
43833660e330SKris Buschelman   /*
43843660e330SKris Buschelman      Note: This code currently uses demotion of double
43853660e330SKris Buschelman      to float when performing the mixed-mode computation.
43863660e330SKris Buschelman      This may not be numerically reasonable for all applications.
43873660e330SKris Buschelman   */
43883660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
43893660e330SKris Buschelman 
43901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
43911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43923660e330SKris Buschelman   {
4393eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4394eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar*)x;
43952aa5897fSKris Buschelman     int            nz,i,idt,ai16;
43962aa5897fSKris Buschelman     unsigned int   jdx,idx;
43972aa5897fSKris Buschelman     unsigned short *vi;
4398eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
43993660e330SKris Buschelman 
4400eb05f457SKris Buschelman     /* First block is the identity. */
44013660e330SKris Buschelman     idx = 0;
4402eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
44032aa5897fSKris Buschelman     v =  aa + 16*((unsigned int)ai[1]);
44043660e330SKris Buschelman 
44053660e330SKris Buschelman     for (i=1; i<n; ) {
44063660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44073660e330SKris Buschelman       vi   =  aj      + ai[i];
44083660e330SKris Buschelman       nz   =  diag[i] - ai[i];
44093660e330SKris Buschelman       idx +=  4;
44103660e330SKris Buschelman 
4411eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4412eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4413eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
44143660e330SKris Buschelman 
44153660e330SKris Buschelman       while (nz--) {
44163660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44172aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
44183660e330SKris Buschelman 
44193660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4420eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
44213660e330SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44223660e330SKris Buschelman 
44233660e330SKris Buschelman         /* First Column */
44243660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM6)
44253660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
44263660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44273660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM0)
44283660e330SKris Buschelman 
44293660e330SKris Buschelman         /* Second Column */
44303660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM6)
44313660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
44323660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44333660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM1)
44343660e330SKris Buschelman 
44353660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44363660e330SKris Buschelman 
44373660e330SKris Buschelman         /* Third Column */
44383660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM6)
44393660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
44403660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44413660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM2)
44423660e330SKris Buschelman 
44433660e330SKris Buschelman         /* Fourth Column */
44443660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM6)
44453660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
44463660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44473660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM3)
44483660e330SKris Buschelman         SSE_INLINE_END_2
44493660e330SKris Buschelman 
44503660e330SKris Buschelman         v += 16;
44513660e330SKris Buschelman       }
44523660e330SKris Buschelman       v =  aa + 16*ai[++i];
44533660e330SKris Buschelman       PREFETCH_NTA(v);
4454eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
44553660e330SKris Buschelman     }
4456eb05f457SKris Buschelman 
4457eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4458eb05f457SKris Buschelman 
44593660e330SKris Buschelman     idt  = 4*(n-1);
44603660e330SKris Buschelman     ai16 = 16*diag[n-1];
44613660e330SKris Buschelman     v    = aa + ai16 + 16;
44623660e330SKris Buschelman     for (i=n-1; i>=0; ) {
44633660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44643660e330SKris Buschelman       vi = aj + diag[i] + 1;
44653660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
44663660e330SKris Buschelman 
4467eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
44683660e330SKris Buschelman 
44693660e330SKris Buschelman       while (nz--) {
44703660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44712aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
44723660e330SKris Buschelman 
44733660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4474eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
44753660e330SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44763660e330SKris Buschelman 
44773660e330SKris Buschelman         /* First Column */
44783660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM6)
44793660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
44803660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44813660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM0)
44823660e330SKris Buschelman 
44833660e330SKris Buschelman         /* Second Column */
44843660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM6)
44853660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
44863660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44873660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM1)
44883660e330SKris Buschelman 
44893660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44903660e330SKris Buschelman 
44913660e330SKris Buschelman         /* Third Column */
44923660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM6)
44933660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
44943660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44953660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM2)
44963660e330SKris Buschelman 
44973660e330SKris Buschelman         /* Fourth Column */
44983660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM6)
44993660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
45003660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45013660e330SKris Buschelman         SSE_SUB_PS(XMM7,XMM3)
45023660e330SKris Buschelman         SSE_INLINE_END_2
45033660e330SKris Buschelman         v += 16;
45043660e330SKris Buschelman       }
45053660e330SKris Buschelman       v    = aa + ai16;
45063660e330SKris Buschelman       ai16 = 16*diag[--i];
45073660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
45083660e330SKris Buschelman       /*
45093660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
45103660e330SKris Buschelman          which was inverted as part of the factorization
45113660e330SKris Buschelman       */
4512eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
45133660e330SKris Buschelman       /* First Column */
45143660e330SKris Buschelman       SSE_COPY_PS(XMM0,XMM7)
45153660e330SKris Buschelman       SSE_SHUFFLE(XMM0,XMM0,0x00)
45163660e330SKris Buschelman       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
45173660e330SKris Buschelman 
45183660e330SKris Buschelman       /* Second Column */
45193660e330SKris Buschelman       SSE_COPY_PS(XMM1,XMM7)
45203660e330SKris Buschelman       SSE_SHUFFLE(XMM1,XMM1,0x55)
45213660e330SKris Buschelman       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
45223660e330SKris Buschelman       SSE_ADD_PS(XMM0,XMM1)
45233660e330SKris Buschelman 
45243660e330SKris Buschelman       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
45253660e330SKris Buschelman 
45263660e330SKris Buschelman       /* Third Column */
45273660e330SKris Buschelman       SSE_COPY_PS(XMM2,XMM7)
45283660e330SKris Buschelman       SSE_SHUFFLE(XMM2,XMM2,0xAA)
45293660e330SKris Buschelman       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
45303660e330SKris Buschelman       SSE_ADD_PS(XMM0,XMM2)
45313660e330SKris Buschelman 
45323660e330SKris Buschelman       /* Fourth Column */
45333660e330SKris Buschelman       SSE_COPY_PS(XMM3,XMM7)
45343660e330SKris Buschelman       SSE_SHUFFLE(XMM3,XMM3,0xFF)
45353660e330SKris Buschelman       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
45363660e330SKris Buschelman       SSE_ADD_PS(XMM0,XMM3)
45373660e330SKris Buschelman 
45383660e330SKris Buschelman       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
45393660e330SKris Buschelman       SSE_INLINE_END_3
45403660e330SKris Buschelman 
45413660e330SKris Buschelman       v    = aa + ai16 + 16;
45423660e330SKris Buschelman       idt -= 4;
45433660e330SKris Buschelman     }
4544eb05f457SKris Buschelman 
4545eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4546eb05f457SKris Buschelman     idt = 4*(n-1);
4547eb05f457SKris Buschelman     for (i=n-1; i>=0; i--) {
4548eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4549eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4550eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4551eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4552eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4553eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4554eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4555eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
455654693613SKris Buschelman       idt     -= 4;
45573660e330SKris Buschelman     }
4558eb05f457SKris Buschelman 
4559eb05f457SKris Buschelman   } /* End of artificial scope. */
45601ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45611ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4562dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
45633660e330SKris Buschelman   SSE_SCOPE_END;
45643660e330SKris Buschelman   PetscFunctionReturn(0);
45653660e330SKris Buschelman }
45663660e330SKris Buschelman 
45677cf1b8d3SKris Buschelman #undef __FUNCT__
45687cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4569dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
45707cf1b8d3SKris Buschelman {
45717cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
45727cf1b8d3SKris Buschelman   int            *aj=a->j;
4573dfbe8321SBarry Smith   PetscErrorCode ierr;
4574dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
45757cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
45767cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
45777cf1b8d3SKris Buschelman 
45787cf1b8d3SKris Buschelman   PetscFunctionBegin;
45797cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
45807cf1b8d3SKris Buschelman   /*
45817cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
45827cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
45837cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
45847cf1b8d3SKris Buschelman   */
45857cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
45867cf1b8d3SKris Buschelman 
45871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45897cf1b8d3SKris Buschelman   {
45907cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
45917cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar*)x;
45927cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
45937cf1b8d3SKris Buschelman     int       jdx,idx;
45947cf1b8d3SKris Buschelman     int       *vi;
45957cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
45967cf1b8d3SKris Buschelman 
45977cf1b8d3SKris Buschelman     /* First block is the identity. */
45987cf1b8d3SKris Buschelman     idx = 0;
45997cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
46007cf1b8d3SKris Buschelman     v =  aa + 16*ai[1];
46017cf1b8d3SKris Buschelman 
46027cf1b8d3SKris Buschelman     for (i=1; i<n; ) {
46037cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46047cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
46057cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
46067cf1b8d3SKris Buschelman       idx +=  4;
46077cf1b8d3SKris Buschelman 
46087cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
46097cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
46107cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
46117cf1b8d3SKris Buschelman 
46127cf1b8d3SKris Buschelman       while (nz--) {
46137cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46147cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
46157cf1b8d3SKris Buschelman /*          jdx = *vi++; */
46167cf1b8d3SKris Buschelman 
46177cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
46187cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
46197cf1b8d3SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46207cf1b8d3SKris Buschelman 
46217cf1b8d3SKris Buschelman         /* First Column */
46227cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM6)
46237cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
46247cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46257cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM0)
46267cf1b8d3SKris Buschelman 
46277cf1b8d3SKris Buschelman         /* Second Column */
46287cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM6)
46297cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
46307cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46317cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM1)
46327cf1b8d3SKris Buschelman 
46337cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46347cf1b8d3SKris Buschelman 
46357cf1b8d3SKris Buschelman         /* Third Column */
46367cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM6)
46377cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
46387cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46397cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM2)
46407cf1b8d3SKris Buschelman 
46417cf1b8d3SKris Buschelman         /* Fourth Column */
46427cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM6)
46437cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
46447cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46457cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM3)
46467cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46477cf1b8d3SKris Buschelman 
46487cf1b8d3SKris Buschelman         v += 16;
46497cf1b8d3SKris Buschelman       }
46507cf1b8d3SKris Buschelman       v =  aa + 16*ai[++i];
46517cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
46527cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
46537cf1b8d3SKris Buschelman     }
46547cf1b8d3SKris Buschelman 
46557cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
46567cf1b8d3SKris Buschelman 
46577cf1b8d3SKris Buschelman     idt  = 4*(n-1);
46587cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
46597cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
46607cf1b8d3SKris Buschelman     for (i=n-1; i>=0; ) {
46617cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46627cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
46637cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
46647cf1b8d3SKris Buschelman 
46657cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
46667cf1b8d3SKris Buschelman 
46677cf1b8d3SKris Buschelman       while (nz--) {
46687cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46697cf1b8d3SKris Buschelman         idx = 4*(*vi++);
46707cf1b8d3SKris Buschelman /*          idx = *vi++; */
46717cf1b8d3SKris Buschelman 
46727cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
46737cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
46747cf1b8d3SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46757cf1b8d3SKris Buschelman 
46767cf1b8d3SKris Buschelman         /* First Column */
46777cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM6)
46787cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
46797cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46807cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM0)
46817cf1b8d3SKris Buschelman 
46827cf1b8d3SKris Buschelman         /* Second Column */
46837cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM6)
46847cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
46857cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46867cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM1)
46877cf1b8d3SKris Buschelman 
46887cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46897cf1b8d3SKris Buschelman 
46907cf1b8d3SKris Buschelman         /* Third Column */
46917cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM6)
46927cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
46937cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46947cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM2)
46957cf1b8d3SKris Buschelman 
46967cf1b8d3SKris Buschelman         /* Fourth Column */
46977cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM6)
46987cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
46997cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
47007cf1b8d3SKris Buschelman         SSE_SUB_PS(XMM7,XMM3)
47017cf1b8d3SKris Buschelman         SSE_INLINE_END_2
47027cf1b8d3SKris Buschelman         v += 16;
47037cf1b8d3SKris Buschelman       }
47047cf1b8d3SKris Buschelman       v    = aa + ai16;
47057cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
47067cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
47077cf1b8d3SKris Buschelman       /*
47087cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
47097cf1b8d3SKris Buschelman          which was inverted as part of the factorization
47107cf1b8d3SKris Buschelman       */
47117cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
47127cf1b8d3SKris Buschelman       /* First Column */
47137cf1b8d3SKris Buschelman       SSE_COPY_PS(XMM0,XMM7)
47147cf1b8d3SKris Buschelman       SSE_SHUFFLE(XMM0,XMM0,0x00)
47157cf1b8d3SKris Buschelman       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
47167cf1b8d3SKris Buschelman 
47177cf1b8d3SKris Buschelman       /* Second Column */
47187cf1b8d3SKris Buschelman       SSE_COPY_PS(XMM1,XMM7)
47197cf1b8d3SKris Buschelman       SSE_SHUFFLE(XMM1,XMM1,0x55)
47207cf1b8d3SKris Buschelman       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
47217cf1b8d3SKris Buschelman       SSE_ADD_PS(XMM0,XMM1)
47227cf1b8d3SKris Buschelman 
47237cf1b8d3SKris Buschelman       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
47247cf1b8d3SKris Buschelman 
47257cf1b8d3SKris Buschelman       /* Third Column */
47267cf1b8d3SKris Buschelman       SSE_COPY_PS(XMM2,XMM7)
47277cf1b8d3SKris Buschelman       SSE_SHUFFLE(XMM2,XMM2,0xAA)
47287cf1b8d3SKris Buschelman       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
47297cf1b8d3SKris Buschelman       SSE_ADD_PS(XMM0,XMM2)
47307cf1b8d3SKris Buschelman 
47317cf1b8d3SKris Buschelman       /* Fourth Column */
47327cf1b8d3SKris Buschelman       SSE_COPY_PS(XMM3,XMM7)
47337cf1b8d3SKris Buschelman       SSE_SHUFFLE(XMM3,XMM3,0xFF)
47347cf1b8d3SKris Buschelman       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
47357cf1b8d3SKris Buschelman       SSE_ADD_PS(XMM0,XMM3)
47367cf1b8d3SKris Buschelman 
47377cf1b8d3SKris Buschelman       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
47387cf1b8d3SKris Buschelman       SSE_INLINE_END_3
47397cf1b8d3SKris Buschelman 
47407cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
47417cf1b8d3SKris Buschelman       idt -= 4;
47427cf1b8d3SKris Buschelman     }
47437cf1b8d3SKris Buschelman 
47447cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
47457cf1b8d3SKris Buschelman     idt = 4*(n-1);
47467cf1b8d3SKris Buschelman     for (i=n-1; i>=0; i--) {
47477cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
47487cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
47497cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
47507cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
47517cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
47527cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
47537cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
47547cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
47557cf1b8d3SKris Buschelman       idt     -= 4;
47567cf1b8d3SKris Buschelman     }
47577cf1b8d3SKris Buschelman 
47587cf1b8d3SKris Buschelman   } /* End of artificial scope. */
47591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
47601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4761dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
47627cf1b8d3SKris Buschelman   SSE_SCOPE_END;
47637cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
47647cf1b8d3SKris Buschelman }
47657cf1b8d3SKris Buschelman 
47663660e330SKris Buschelman #endif
47678f690400SShri Abhyankar 
47684a2ae208SSatish Balay #undef __FUNCT__
476906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
477006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
47714e2b4712SSatish Balay {
47724e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
47734e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
47746849ba73SBarry Smith   PetscErrorCode    ierr;
4775b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4776b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
47775d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4778d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4779d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4780d9fead3dSBarry Smith   const PetscScalar *b;
47814e2b4712SSatish Balay 
47824e2b4712SSatish Balay   PetscFunctionBegin;
47833649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
47841ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4785f1af5d2fSBarry Smith   t    = a->solve_work;
47864e2b4712SSatish Balay 
47874e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47884e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47894e2b4712SSatish Balay 
47904e2b4712SSatish Balay   /* forward solve the lower triangular */
47914e2b4712SSatish Balay   idx  = 3*(*r++);
4792f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
47934e2b4712SSatish Balay   for (i=1; i<n; i++) {
47944e2b4712SSatish Balay     v   = aa + 9*ai[i];
47954e2b4712SSatish Balay     vi  = aj + ai[i];
47964e2b4712SSatish Balay     nz  = diag[i] - ai[i];
47974e2b4712SSatish Balay     idx = 3*(*r++);
4798f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
47994e2b4712SSatish Balay     while (nz--) {
48004e2b4712SSatish Balay       idx = 3*(*vi++);
4801f1af5d2fSBarry Smith       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4802f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4803f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4804f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48054e2b4712SSatish Balay       v  += 9;
48064e2b4712SSatish Balay     }
48074e2b4712SSatish Balay     idx    = 3*i;
4808f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48094e2b4712SSatish Balay   }
48104e2b4712SSatish Balay   /* backward solve the upper triangular */
48114e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
48124e2b4712SSatish Balay     v   = aa + 9*diag[i] + 9;
48134e2b4712SSatish Balay     vi  = aj + diag[i] + 1;
48144e2b4712SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
48154e2b4712SSatish Balay     idt = 3*i;
4816f1af5d2fSBarry Smith     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48174e2b4712SSatish Balay     while (nz--) {
48184e2b4712SSatish Balay       idx = 3*(*vi++);
4819f1af5d2fSBarry Smith       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4820f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4821f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4822f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48234e2b4712SSatish Balay       v  += 9;
48244e2b4712SSatish Balay     }
48254e2b4712SSatish Balay     idc      = 3*(*c--);
48264e2b4712SSatish Balay     v        = aa + 9*diag[i];
4827f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4828f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4829f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48304e2b4712SSatish Balay   }
48314e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48324e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48333649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
48341ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4835dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48364e2b4712SSatish Balay   PetscFunctionReturn(0);
48374e2b4712SSatish Balay }
48384e2b4712SSatish Balay 
48390c4413a7SShri Abhyankar #undef __FUNCT__
48404dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
48414dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
48420c4413a7SShri Abhyankar {
48430c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
48440c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48450c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4846b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4847b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
48480c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48490c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48500c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
48510c4413a7SShri Abhyankar   const PetscScalar *b;
48520c4413a7SShri Abhyankar 
48530c4413a7SShri Abhyankar   PetscFunctionBegin;
48543649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48550c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48560c4413a7SShri Abhyankar   t    = a->solve_work;
48570c4413a7SShri Abhyankar 
48580c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48590c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
48600c4413a7SShri Abhyankar 
48610c4413a7SShri Abhyankar   /* forward solve the lower triangular */
48620c4413a7SShri Abhyankar   idx  = 3*r[0];
48630c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
48640c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
48650c4413a7SShri Abhyankar     v   = aa + 9*ai[i];
48660c4413a7SShri Abhyankar     vi  = aj + ai[i];
48670c4413a7SShri Abhyankar     nz  = ai[i+1] - ai[i];
48680c4413a7SShri Abhyankar     idx = 3*r[i];
48690c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48700c4413a7SShri Abhyankar     for (m=0; m<nz; m++) {
48710c4413a7SShri Abhyankar       idx = 3*vi[m];
48720c4413a7SShri Abhyankar       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48730c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48740c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48750c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48760c4413a7SShri Abhyankar       v  += 9;
48770c4413a7SShri Abhyankar     }
48780c4413a7SShri Abhyankar     idx    = 3*i;
48790c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48800c4413a7SShri Abhyankar   }
48810c4413a7SShri Abhyankar   /* backward solve the upper triangular */
48820c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--) {
48830c4413a7SShri Abhyankar     v   = aa + 9*(adiag[i+1]+1);
48840c4413a7SShri Abhyankar     vi  = aj + adiag[i+1]+1;
48850c4413a7SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
48860c4413a7SShri Abhyankar     idt = 3*i;
48870c4413a7SShri Abhyankar     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48880c4413a7SShri Abhyankar     for (m=0; m<nz; m++) {
48890c4413a7SShri Abhyankar       idx = 3*vi[m];
48900c4413a7SShri Abhyankar       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48910c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48920c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48930c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48940c4413a7SShri Abhyankar       v  += 9;
48950c4413a7SShri Abhyankar     }
48960c4413a7SShri Abhyankar     idc      = 3*c[i];
48970c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
48980c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
48990c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
49000c4413a7SShri Abhyankar   }
49010c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49020c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49033649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49040c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49050c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
49060c4413a7SShri Abhyankar   PetscFunctionReturn(0);
49070c4413a7SShri Abhyankar }
49080c4413a7SShri Abhyankar 
490915091d37SBarry Smith /*
491015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
491115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
491215091d37SBarry Smith */
49134a2ae208SSatish Balay #undef __FUNCT__
491406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
491506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
491615091d37SBarry Smith {
491715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
49180b68f018SBarry Smith   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
4919dfbe8321SBarry Smith   PetscErrorCode    ierr;
49200b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4921d9fead3dSBarry Smith   const MatScalar   *aa   =a->a,*v;
4922d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4923d9fead3dSBarry Smith   const PetscScalar *b;
49240b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
492515091d37SBarry Smith 
492615091d37SBarry Smith   PetscFunctionBegin;
49273649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
49281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
492915091d37SBarry Smith 
493015091d37SBarry Smith   /* forward solve the lower triangular */
493115091d37SBarry Smith   idx  = 0;
493215091d37SBarry Smith   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
493315091d37SBarry Smith   for (i=1; i<n; i++) {
493415091d37SBarry Smith     v    =  aa      + 9*ai[i];
493515091d37SBarry Smith     vi   =  aj      + ai[i];
493615091d37SBarry Smith     nz   =  diag[i] - ai[i];
493715091d37SBarry Smith     idx +=  3;
4938f1af5d2fSBarry Smith     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
493915091d37SBarry Smith     while (nz--) {
494015091d37SBarry Smith       jdx = 3*(*vi++);
494115091d37SBarry Smith       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4942f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
494515091d37SBarry Smith       v  += 9;
494615091d37SBarry Smith     }
4947f1af5d2fSBarry Smith     x[idx]   = s1;
4948f1af5d2fSBarry Smith     x[1+idx] = s2;
4949f1af5d2fSBarry Smith     x[2+idx] = s3;
495015091d37SBarry Smith   }
495115091d37SBarry Smith   /* backward solve the upper triangular */
495215091d37SBarry Smith   for (i=n-1; i>=0; i--) {
495315091d37SBarry Smith     v   = aa + 9*diag[i] + 9;
495415091d37SBarry Smith     vi  = aj + diag[i] + 1;
495515091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
495615091d37SBarry Smith     idt = 3*i;
4957f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt];
4958f1af5d2fSBarry Smith     s3  = x[2+idt];
495915091d37SBarry Smith     while (nz--) {
496015091d37SBarry Smith       idx = 3*(*vi++);
496115091d37SBarry Smith       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4962f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4963f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4964f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
496515091d37SBarry Smith       v  += 9;
496615091d37SBarry Smith     }
496715091d37SBarry Smith     v        = aa +  9*diag[i];
4968f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4969f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4970f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
497115091d37SBarry Smith   }
497215091d37SBarry Smith 
49733649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4975dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
497615091d37SBarry Smith   PetscFunctionReturn(0);
497715091d37SBarry Smith }
497815091d37SBarry Smith 
4979cee9d6f2SShri Abhyankar #undef __FUNCT__
49804dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
49814dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4982b2b2dd24SShri Abhyankar {
4983b2b2dd24SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4984b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4985b2b2dd24SShri Abhyankar   PetscErrorCode    ierr;
4986b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,jdx,idt;
4987b3260449SShri Abhyankar   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4988b2b2dd24SShri Abhyankar   const MatScalar   *aa=a->a,*v;
4989b2b2dd24SShri Abhyankar   PetscScalar       *x;
4990b2b2dd24SShri Abhyankar   const PetscScalar *b;
4991b2b2dd24SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3;
4992b2b2dd24SShri Abhyankar 
4993b2b2dd24SShri Abhyankar   PetscFunctionBegin;
49943649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4995b2b2dd24SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4996b2b2dd24SShri Abhyankar   /* forward solve the lower triangular */
4997b2b2dd24SShri Abhyankar   idx  = 0;
4998b2b2dd24SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4999b2b2dd24SShri Abhyankar   for (i=1; i<n; i++) {
5000b2b2dd24SShri Abhyankar     v   = aa + bs2*ai[i];
5001b2b2dd24SShri Abhyankar     vi  = aj + ai[i];
5002b2b2dd24SShri Abhyankar     nz  = ai[i+1] - ai[i];
5003b2b2dd24SShri Abhyankar     idx = bs*i;
5004b2b2dd24SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5005b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
5006b2b2dd24SShri Abhyankar       jdx = bs*vi[k];
5007b2b2dd24SShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5008b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5009b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5010b2b2dd24SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5011b2b2dd24SShri Abhyankar 
5012b2b2dd24SShri Abhyankar       v +=  bs2;
5013b2b2dd24SShri Abhyankar     }
5014b2b2dd24SShri Abhyankar 
5015b2b2dd24SShri Abhyankar     x[idx]   = s1;
5016b2b2dd24SShri Abhyankar     x[1+idx] = s2;
5017b2b2dd24SShri Abhyankar     x[2+idx] = s3;
5018b2b2dd24SShri Abhyankar   }
5019b2b2dd24SShri Abhyankar 
5020b2b2dd24SShri Abhyankar   /* backward solve the upper triangular */
5021b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
5022b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
5023b2b2dd24SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5024b2b2dd24SShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
5025b2b2dd24SShri Abhyankar     idt = bs*i;
5026b2b2dd24SShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5027b2b2dd24SShri Abhyankar 
5028b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
5029b2b2dd24SShri Abhyankar       idx = bs*vi[k];
5030b2b2dd24SShri Abhyankar       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5031b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5032b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5033b2b2dd24SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5034b2b2dd24SShri Abhyankar 
5035b2b2dd24SShri Abhyankar       v +=  bs2;
5036b2b2dd24SShri Abhyankar     }
5037b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5038b2b2dd24SShri Abhyankar     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5039b2b2dd24SShri Abhyankar     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5040b2b2dd24SShri Abhyankar     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5041b2b2dd24SShri Abhyankar 
5042b2b2dd24SShri Abhyankar   }
5043b2b2dd24SShri Abhyankar 
50443649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5045b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5046b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5047b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5048b2b2dd24SShri Abhyankar }
5049b2b2dd24SShri Abhyankar 
5050b2b2dd24SShri Abhyankar #undef __FUNCT__
505106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
505206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
50534e2b4712SSatish Balay {
50544e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
50554e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
50566849ba73SBarry Smith   PetscErrorCode    ierr;
5057b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5058b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
50595d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5060d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5061d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
5062d9fead3dSBarry Smith   const PetscScalar *b;
50634e2b4712SSatish Balay 
50644e2b4712SSatish Balay   PetscFunctionBegin;
50653649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
50661ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5067f1af5d2fSBarry Smith   t    = a->solve_work;
50684e2b4712SSatish Balay 
50694e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50704e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
50714e2b4712SSatish Balay 
50724e2b4712SSatish Balay   /* forward solve the lower triangular */
50734e2b4712SSatish Balay   idx  = 2*(*r++);
5074f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
50754e2b4712SSatish Balay   for (i=1; i<n; i++) {
50764e2b4712SSatish Balay     v   = aa + 4*ai[i];
50774e2b4712SSatish Balay     vi  = aj + ai[i];
50784e2b4712SSatish Balay     nz  = diag[i] - ai[i];
50794e2b4712SSatish Balay     idx = 2*(*r++);
5080f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
50814e2b4712SSatish Balay     while (nz--) {
50824e2b4712SSatish Balay       idx = 2*(*vi++);
5083f1af5d2fSBarry Smith       x1  = t[idx]; x2 = t[1+idx];
5084f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5085f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
50864e2b4712SSatish Balay       v  += 4;
50874e2b4712SSatish Balay     }
50884e2b4712SSatish Balay     idx    = 2*i;
5089f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
50904e2b4712SSatish Balay   }
50914e2b4712SSatish Balay   /* backward solve the upper triangular */
50924e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
50934e2b4712SSatish Balay     v   = aa + 4*diag[i] + 4;
50944e2b4712SSatish Balay     vi  = aj + diag[i] + 1;
50954e2b4712SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
50964e2b4712SSatish Balay     idt = 2*i;
5097f1af5d2fSBarry Smith     s1  = t[idt]; s2 = t[1+idt];
50984e2b4712SSatish Balay     while (nz--) {
50994e2b4712SSatish Balay       idx = 2*(*vi++);
5100f1af5d2fSBarry Smith       x1  = t[idx]; x2 = t[1+idx];
5101f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5102f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51034e2b4712SSatish Balay       v  += 4;
51044e2b4712SSatish Balay     }
51054e2b4712SSatish Balay     idc      = 2*(*c--);
51064e2b4712SSatish Balay     v        = aa + 4*diag[i];
5107f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5108f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51094e2b4712SSatish Balay   }
51104e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51114e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51154e2b4712SSatish Balay   PetscFunctionReturn(0);
51164e2b4712SSatish Balay }
51174e2b4712SSatish Balay 
51180c4413a7SShri Abhyankar #undef __FUNCT__
51194dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
51204dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
51210c4413a7SShri Abhyankar {
51220c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
51230c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
51240c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5125b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5126b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
51270c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
51280c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
51290c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
51300c4413a7SShri Abhyankar   const PetscScalar *b;
51310c4413a7SShri Abhyankar 
51320c4413a7SShri Abhyankar   PetscFunctionBegin;
51333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51340c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
51350c4413a7SShri Abhyankar   t    = a->solve_work;
51360c4413a7SShri Abhyankar 
51370c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51380c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
51390c4413a7SShri Abhyankar 
51400c4413a7SShri Abhyankar   /* forward solve the lower triangular */
51410c4413a7SShri Abhyankar   idx  = 2*r[0];
51420c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
51430c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
51440c4413a7SShri Abhyankar     v   = aa + 4*ai[i];
51450c4413a7SShri Abhyankar     vi  = aj + ai[i];
51460c4413a7SShri Abhyankar     nz  = ai[i+1] - ai[i];
51470c4413a7SShri Abhyankar     idx = 2*r[i];
51480c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
51490c4413a7SShri Abhyankar     for (m=0; m<nz; m++) {
51500c4413a7SShri Abhyankar       jdx = 2*vi[m];
51510c4413a7SShri Abhyankar       x1  = t[jdx]; x2 = t[1+jdx];
51520c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51530c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51540c4413a7SShri Abhyankar       v  += 4;
51550c4413a7SShri Abhyankar     }
51560c4413a7SShri Abhyankar     idx    = 2*i;
51570c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
51580c4413a7SShri Abhyankar   }
51590c4413a7SShri Abhyankar   /* backward solve the upper triangular */
51600c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--) {
51610c4413a7SShri Abhyankar     v   = aa + 4*(adiag[i+1]+1);
51620c4413a7SShri Abhyankar     vi  = aj + adiag[i+1]+1;
51630c4413a7SShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
51640c4413a7SShri Abhyankar     idt = 2*i;
51650c4413a7SShri Abhyankar     s1  = t[idt]; s2 = t[1+idt];
51660c4413a7SShri Abhyankar     for (m=0; m<nz; m++) {
51670c4413a7SShri Abhyankar       idx = 2*vi[m];
51680c4413a7SShri Abhyankar       x1  = t[idx]; x2 = t[1+idx];
51690c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51700c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51710c4413a7SShri Abhyankar       v  += 4;
51720c4413a7SShri Abhyankar     }
51730c4413a7SShri Abhyankar     idc      = 2*c[i];
51740c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
51750c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51760c4413a7SShri Abhyankar   }
51770c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51780c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51793649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51800c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51810c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51820c4413a7SShri Abhyankar   PetscFunctionReturn(0);
51830c4413a7SShri Abhyankar }
51848f690400SShri Abhyankar 
518515091d37SBarry Smith /*
518615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
518715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
518815091d37SBarry Smith */
51894a2ae208SSatish Balay #undef __FUNCT__
519006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
519106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
519215091d37SBarry Smith {
519315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5194b3260449SShri Abhyankar   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5195dfbe8321SBarry Smith   PetscErrorCode    ierr;
5196d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5197d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5198d9fead3dSBarry Smith   const PetscScalar *b;
5199b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
520015091d37SBarry Smith 
520115091d37SBarry Smith   PetscFunctionBegin;
52023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
52031ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
520415091d37SBarry Smith 
520515091d37SBarry Smith   /* forward solve the lower triangular */
520615091d37SBarry Smith   idx  = 0;
520715091d37SBarry Smith   x[0] = b[0]; x[1] = b[1];
520815091d37SBarry Smith   for (i=1; i<n; i++) {
520915091d37SBarry Smith     v    =  aa      + 4*ai[i];
521015091d37SBarry Smith     vi   =  aj      + ai[i];
521115091d37SBarry Smith     nz   =  diag[i] - ai[i];
521215091d37SBarry Smith     idx +=  2;
5213f1af5d2fSBarry Smith     s1   =  b[idx];s2 = b[1+idx];
521415091d37SBarry Smith     while (nz--) {
521515091d37SBarry Smith       jdx = 2*(*vi++);
521615091d37SBarry Smith       x1  = x[jdx];x2 = x[1+jdx];
5217f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5218f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
521915091d37SBarry Smith       v  += 4;
522015091d37SBarry Smith     }
5221f1af5d2fSBarry Smith     x[idx]   = s1;
5222f1af5d2fSBarry Smith     x[1+idx] = s2;
522315091d37SBarry Smith   }
522415091d37SBarry Smith   /* backward solve the upper triangular */
522515091d37SBarry Smith   for (i=n-1; i>=0; i--) {
522615091d37SBarry Smith     v   = aa + 4*diag[i] + 4;
522715091d37SBarry Smith     vi  = aj + diag[i] + 1;
522815091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
522915091d37SBarry Smith     idt = 2*i;
5230f1af5d2fSBarry Smith     s1  = x[idt];  s2 = x[1+idt];
523115091d37SBarry Smith     while (nz--) {
523215091d37SBarry Smith       idx = 2*(*vi++);
523315091d37SBarry Smith       x1  = x[idx];   x2 = x[1+idx];
5234f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5235f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
523615091d37SBarry Smith       v  += 4;
523715091d37SBarry Smith     }
523815091d37SBarry Smith     v        = aa +  4*diag[i];
5239f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5240f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
524115091d37SBarry Smith   }
524215091d37SBarry Smith 
52433649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5245dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
524615091d37SBarry Smith   PetscFunctionReturn(0);
524715091d37SBarry Smith }
524815091d37SBarry Smith 
5249cee9d6f2SShri Abhyankar #undef __FUNCT__
52504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
52514dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5252b2b2dd24SShri Abhyankar {
5253b2b2dd24SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5254b3260449SShri Abhyankar   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5255b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
5256b2b2dd24SShri Abhyankar   PetscErrorCode    ierr;
5257b2b2dd24SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5258b2b2dd24SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2;
5259b2b2dd24SShri Abhyankar   const PetscScalar *b;
5260b2b2dd24SShri Abhyankar 
5261b2b2dd24SShri Abhyankar   PetscFunctionBegin;
52623649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5263b2b2dd24SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5264b2b2dd24SShri Abhyankar   /* forward solve the lower triangular */
5265b2b2dd24SShri Abhyankar   idx  = 0;
5266b2b2dd24SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx];
5267b2b2dd24SShri Abhyankar   for (i=1; i<n; i++) {
5268b2b2dd24SShri Abhyankar     v   = aa + 4*ai[i];
5269b2b2dd24SShri Abhyankar     vi  = aj + ai[i];
5270b2b2dd24SShri Abhyankar     nz  = ai[i+1] - ai[i];
5271b2b2dd24SShri Abhyankar     idx = 2*i;
5272b2b2dd24SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];
52734c0dbd8dSJed Brown     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52744c0dbd8dSJed Brown     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5275b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
5276b2b2dd24SShri Abhyankar       jdx = 2*vi[k];
5277b2b2dd24SShri Abhyankar       x1  = x[jdx];x2 = x[1+jdx];
5278b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
5279b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
5280b2b2dd24SShri Abhyankar       v  +=  4;
5281b2b2dd24SShri Abhyankar     }
5282b2b2dd24SShri Abhyankar     x[idx]   = s1;
5283b2b2dd24SShri Abhyankar     x[1+idx] = s2;
5284b2b2dd24SShri Abhyankar   }
5285b2b2dd24SShri Abhyankar 
5286b2b2dd24SShri Abhyankar   /* backward solve the upper triangular */
5287b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
5288b2b2dd24SShri Abhyankar     v   = aa + 4*(adiag[i+1]+1);
5289b2b2dd24SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5290b2b2dd24SShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
5291b2b2dd24SShri Abhyankar     idt = 2*i;
5292b2b2dd24SShri Abhyankar     s1  = x[idt];  s2 = x[1+idt];
52934c0dbd8dSJed Brown     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52944c0dbd8dSJed Brown     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5295b2b2dd24SShri Abhyankar     for (k=0; k<nz; k++) {
5296b2b2dd24SShri Abhyankar       idx = 2*vi[k];
5297b2b2dd24SShri Abhyankar       x1  = x[idx];   x2 = x[1+idx];
5298b2b2dd24SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
5299b2b2dd24SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
5300b2b2dd24SShri Abhyankar       v  += 4;
5301b2b2dd24SShri Abhyankar     }
5302b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5303b2b2dd24SShri Abhyankar     x[idt]   = v[0]*s1 + v[2]*s2;
5304b2b2dd24SShri Abhyankar     x[1+idt] = v[1]*s1 + v[3]*s2;
5305b2b2dd24SShri Abhyankar   }
5306b2b2dd24SShri Abhyankar 
53073649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5308b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5309b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5310b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5311b2b2dd24SShri Abhyankar }
5312b2b2dd24SShri Abhyankar 
5313b2b2dd24SShri Abhyankar #undef __FUNCT__
531406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
531506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
53164e2b4712SSatish Balay {
53174e2b4712SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
53184e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
53196849ba73SBarry Smith   PetscErrorCode    ierr;
5320b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5321b3260449SShri Abhyankar   PetscInt          i,nz;
53225d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5323b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5324b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5325b3260449SShri Abhyankar   const PetscScalar *b;
53264e2b4712SSatish Balay 
53274e2b4712SSatish Balay   PetscFunctionBegin;
53284e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
53294e2b4712SSatish Balay 
53303649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
53311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5332f1af5d2fSBarry Smith   t    = a->solve_work;
53334e2b4712SSatish Balay 
53344e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
53354e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
53364e2b4712SSatish Balay 
53374e2b4712SSatish Balay   /* forward solve the lower triangular */
5338f1af5d2fSBarry Smith   t[0] = b[*r++];
53394e2b4712SSatish Balay   for (i=1; i<n; i++) {
53404e2b4712SSatish Balay     v  = aa + ai[i];
53414e2b4712SSatish Balay     vi = aj + ai[i];
53424e2b4712SSatish Balay     nz = diag[i] - ai[i];
5343f1af5d2fSBarry Smith     s1 = b[*r++];
53444e2b4712SSatish Balay     while (nz--) {
5345f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53464e2b4712SSatish Balay     }
5347f1af5d2fSBarry Smith     t[i] = s1;
53484e2b4712SSatish Balay   }
53494e2b4712SSatish Balay   /* backward solve the upper triangular */
53504e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
53514e2b4712SSatish Balay     v  = aa + diag[i] + 1;
53524e2b4712SSatish Balay     vi = aj + diag[i] + 1;
53534e2b4712SSatish Balay     nz = ai[i+1] - diag[i] - 1;
5354f1af5d2fSBarry Smith     s1 = t[i];
53554e2b4712SSatish Balay     while (nz--) {
5356f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53574e2b4712SSatish Balay     }
5358f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
53594e2b4712SSatish Balay   }
53604e2b4712SSatish Balay 
53614e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
53624e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
53633649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
53641ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5365dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53664e2b4712SSatish Balay   PetscFunctionReturn(0);
53674e2b4712SSatish Balay }
5368048b5e81SShri Abhyankar 
5369048b5e81SShri Abhyankar #undef __FUNCT__
5370048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5371048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5372048b5e81SShri Abhyankar {
5373048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
5374048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5375048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5376048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5377048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5378048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5379048b5e81SShri Abhyankar   const PetscScalar *b;
5380048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5381048b5e81SShri Abhyankar 
5382048b5e81SShri Abhyankar   PetscFunctionBegin;
5383048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5384048b5e81SShri Abhyankar 
53853649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5386048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5387048b5e81SShri Abhyankar   tmp  = a->solve_work;
5388048b5e81SShri Abhyankar 
5389048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5390048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5391048b5e81SShri Abhyankar 
5392048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5393048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5394048b5e81SShri Abhyankar   v      = aa;
5395048b5e81SShri Abhyankar   vi     = aj;
5396048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5397048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5398048b5e81SShri Abhyankar     sum = b[r[i]];
5399048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5400048b5e81SShri Abhyankar     tmp[i] = sum;
5401048b5e81SShri Abhyankar     v     += nz; vi += nz;
5402048b5e81SShri Abhyankar   }
5403048b5e81SShri Abhyankar 
5404048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5405048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--) {
5406048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5407048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5408048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5409048b5e81SShri Abhyankar     sum = tmp[i];
5410048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5411048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5412048b5e81SShri Abhyankar   }
5413048b5e81SShri Abhyankar 
5414048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5415048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5417048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5418048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5419048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5420048b5e81SShri Abhyankar }
5421048b5e81SShri Abhyankar 
542215091d37SBarry Smith /*
542315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
542415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
542515091d37SBarry Smith */
54264a2ae208SSatish Balay #undef __FUNCT__
542706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
542806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
542915091d37SBarry Smith {
543015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5431b3260449SShri Abhyankar   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5432dfbe8321SBarry Smith   PetscErrorCode    ierr;
5433b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5434b3260449SShri Abhyankar   PetscScalar       *x;
5435b3260449SShri Abhyankar   const PetscScalar *b;
543687828ca2SBarry Smith   PetscScalar       s1,x1;
5437b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
543815091d37SBarry Smith 
543915091d37SBarry Smith   PetscFunctionBegin;
54403649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
54411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544215091d37SBarry Smith 
544315091d37SBarry Smith   /* forward solve the lower triangular */
544415091d37SBarry Smith   idx  = 0;
544515091d37SBarry Smith   x[0] = b[0];
544615091d37SBarry Smith   for (i=1; i<n; i++) {
544715091d37SBarry Smith     v    =  aa      + ai[i];
544815091d37SBarry Smith     vi   =  aj      + ai[i];
544915091d37SBarry Smith     nz   =  diag[i] - ai[i];
545015091d37SBarry Smith     idx +=  1;
5451f1af5d2fSBarry Smith     s1   =  b[idx];
545215091d37SBarry Smith     while (nz--) {
545315091d37SBarry Smith       jdx = *vi++;
545415091d37SBarry Smith       x1  = x[jdx];
5455f1af5d2fSBarry Smith       s1 -= v[0]*x1;
545615091d37SBarry Smith       v  += 1;
545715091d37SBarry Smith     }
5458f1af5d2fSBarry Smith     x[idx] = s1;
545915091d37SBarry Smith   }
546015091d37SBarry Smith   /* backward solve the upper triangular */
546115091d37SBarry Smith   for (i=n-1; i>=0; i--) {
546215091d37SBarry Smith     v   = aa + diag[i] + 1;
546315091d37SBarry Smith     vi  = aj + diag[i] + 1;
546415091d37SBarry Smith     nz  = ai[i+1] - diag[i] - 1;
546515091d37SBarry Smith     idt = i;
5466f1af5d2fSBarry Smith     s1  = x[idt];
546715091d37SBarry Smith     while (nz--) {
546815091d37SBarry Smith       idx = *vi++;
546915091d37SBarry Smith       x1  = x[idx];
5470f1af5d2fSBarry Smith       s1 -= v[0]*x1;
547115091d37SBarry Smith       v  += 1;
547215091d37SBarry Smith     }
547315091d37SBarry Smith     v      = aa +  diag[i];
5474f1af5d2fSBarry Smith     x[idt] = v[0]*s1;
547515091d37SBarry Smith   }
54763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5478dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
547915091d37SBarry Smith   PetscFunctionReturn(0);
548015091d37SBarry Smith }
54814e2b4712SSatish Balay 
5482048b5e81SShri Abhyankar 
5483048b5e81SShri Abhyankar #undef __FUNCT__
5484048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5485048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5486048b5e81SShri Abhyankar {
5487048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5488048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5489048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5490048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5491048b5e81SShri Abhyankar   const PetscScalar *b;
5492048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5493048b5e81SShri Abhyankar   PetscInt          i,nz;
5494048b5e81SShri Abhyankar 
5495048b5e81SShri Abhyankar   PetscFunctionBegin;
5496048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5497048b5e81SShri Abhyankar 
54983649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5499048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5500048b5e81SShri Abhyankar 
5501048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5502048b5e81SShri Abhyankar   x[0] = b[0];
5503048b5e81SShri Abhyankar   v    = aa;
5504048b5e81SShri Abhyankar   vi   = aj;
5505048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5506048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5507048b5e81SShri Abhyankar     sum = b[i];
5508048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5509048b5e81SShri Abhyankar     v   += nz;
5510048b5e81SShri Abhyankar     vi  += nz;
5511048b5e81SShri Abhyankar     x[i] = sum;
5512048b5e81SShri Abhyankar   }
5513048b5e81SShri Abhyankar 
5514048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5515048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--) {
5516048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5517048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5518048b5e81SShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
5519048b5e81SShri Abhyankar     sum = x[i];
5520048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5521048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5522048b5e81SShri Abhyankar   }
5523048b5e81SShri Abhyankar 
5524048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
55253649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5526048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5527048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5528048b5e81SShri Abhyankar }
5529048b5e81SShri Abhyankar 
55304e2b4712SSatish Balay /* ----------------------------------------------------------------*/
553109573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
55326bce7ff8SHong Zhang 
55332b0b2ea7SShri Abhyankar #undef __FUNCT__
553429a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5535766f9fbaSBarry Smith /*
5536766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5537766f9fbaSBarry Smith */
553829a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
55392b0b2ea7SShri Abhyankar {
55402b0b2ea7SShri Abhyankar   Mat             C =B;
55412b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
55422b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5543766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5544766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5545766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5546766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5547766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
55482b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
55490fa040f9SShri Abhyankar   PetscInt        sol_ver;
55502b0b2ea7SShri Abhyankar 
55512b0b2ea7SShri Abhyankar   PetscFunctionBegin;
5552c55dd799SBarry Smith   ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
55530fa040f9SShri Abhyankar 
55542b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
55552b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
55562b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
55572b0b2ea7SShri Abhyankar 
55582b0b2ea7SShri Abhyankar   for (i=0; i<n; i++) {
55592b0b2ea7SShri Abhyankar     /* zero rtmp */
55602b0b2ea7SShri Abhyankar     /* L part */
55612b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
55622b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55632b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++) {
55642b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55652b0b2ea7SShri Abhyankar     }
55662b0b2ea7SShri Abhyankar 
55672b0b2ea7SShri Abhyankar     /* U part */
55682b0b2ea7SShri Abhyankar     nz    = bdiag[i] - bdiag[i+1];
55692b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
55702b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++) {
55712b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55722b0b2ea7SShri Abhyankar     }
55732b0b2ea7SShri Abhyankar 
55742b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
557529a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
557629a97285SShri Abhyankar     ajtmp = aj + ai[i];
557729a97285SShri Abhyankar     v     = aa + bs2*ai[i];
55782b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
557929a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
55802b0b2ea7SShri Abhyankar     }
55812b0b2ea7SShri Abhyankar 
55822b0b2ea7SShri Abhyankar     /* elimination */
55832b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55842b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
55852b0b2ea7SShri Abhyankar     for (k=0; k < nzL; k++) {
55862b0b2ea7SShri Abhyankar       row = bjtmp[k];
55872b0b2ea7SShri Abhyankar       pc  = rtmp + bs2*row;
5588c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
5589c35f09e5SBarry Smith         if (pc[j]!=0.0) {
5590c35f09e5SBarry Smith           flg = 1;
5591c35f09e5SBarry Smith           break;
5592c35f09e5SBarry Smith         }
5593c35f09e5SBarry Smith       }
55942b0b2ea7SShri Abhyankar       if (flg) {
55952b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
559696b95a6bSBarry Smith         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
559796b95a6bSBarry Smith         /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
55982b0b2ea7SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
55992b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
56002b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56012b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5602766f9fbaSBarry Smith           vv = rtmp + bs2*pj[j];
560396b95a6bSBarry Smith           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
560496b95a6bSBarry Smith           /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
56052b0b2ea7SShri Abhyankar           pv += bs2;
56062b0b2ea7SShri Abhyankar         }
5607766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56082b0b2ea7SShri Abhyankar       }
56092b0b2ea7SShri Abhyankar     }
56102b0b2ea7SShri Abhyankar 
56112b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
56122b0b2ea7SShri Abhyankar     /* L part */
56132b0b2ea7SShri Abhyankar     pv = b->a + bs2*bi[i];
56142b0b2ea7SShri Abhyankar     pj = b->j + bi[i];
56152b0b2ea7SShri Abhyankar     nz = bi[i+1] - bi[i];
56162b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56172b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56182b0b2ea7SShri Abhyankar     }
56192b0b2ea7SShri Abhyankar 
56202b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56212b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
56222b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
56232b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
562496b95a6bSBarry Smith     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
562596b95a6bSBarry Smith     ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
56262b0b2ea7SShri Abhyankar 
56272b0b2ea7SShri Abhyankar     /* U part */
56282b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56292b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56302b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56312b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56322b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56332b0b2ea7SShri Abhyankar     }
56342b0b2ea7SShri Abhyankar   }
56352b0b2ea7SShri Abhyankar 
56362b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5637*26fbe8dcSKarl Rupp 
5638832cc040SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5639766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
56402b0b2ea7SShri Abhyankar   C->assembled           = PETSC_TRUE;
5641*26fbe8dcSKarl Rupp 
5642766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56432b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
56442b0b2ea7SShri Abhyankar }
56452b0b2ea7SShri Abhyankar 
56466bce7ff8SHong Zhang #undef __FUNCT__
56474dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
56484dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
56496bce7ff8SHong Zhang {
56506bce7ff8SHong Zhang   Mat            C     =B;
56516bce7ff8SHong Zhang   Mat_SeqBAIJ    *a    =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
56526bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
56536bce7ff8SHong Zhang   PetscErrorCode ierr;
56545a586d82SBarry Smith   const PetscInt *r,*ic;
56556bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
56566bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5657b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5658914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5659914a18a2SHong Zhang   MatScalar      *v_work;
5660ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity;
56616bce7ff8SHong Zhang 
56626bce7ff8SHong Zhang   PetscFunctionBegin;
56636bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
56646bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5665ae3d28f0SHong Zhang 
5666fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5667fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
56686bce7ff8SHong Zhang 
5669914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5670fca92195SBarry Smith   ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5671914a18a2SHong Zhang 
56726bce7ff8SHong Zhang   for (i=0; i<n; i++) {
56736bce7ff8SHong Zhang     /* zero rtmp */
56746bce7ff8SHong Zhang     /* L part */
56756bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
56766bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5677914a18a2SHong Zhang     for  (j=0; j<nz; j++) {
5678914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5679914a18a2SHong Zhang     }
56806bce7ff8SHong Zhang 
56816bce7ff8SHong Zhang     /* U part */
56821a83e813SShri Abhyankar     nz    = bdiag[i] - bdiag[i+1];
56831a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56841a83e813SShri Abhyankar     for  (j=0; j<nz; j++) {
56851a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56861a83e813SShri Abhyankar     }
56871a83e813SShri Abhyankar 
56881a83e813SShri Abhyankar     /* load in initial (unfactored row) */
56891a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
56901a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
56911a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
56921a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
56931a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56941a83e813SShri Abhyankar     }
56951a83e813SShri Abhyankar 
56961a83e813SShri Abhyankar     /* elimination */
56971a83e813SShri Abhyankar     bjtmp = bj + bi[i];
56981a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
56991a83e813SShri Abhyankar     for (k=0; k < nzL; k++) {
57001a83e813SShri Abhyankar       row = bjtmp[k];
57011a83e813SShri Abhyankar       pc  = rtmp + bs2*row;
5702c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
5703c35f09e5SBarry Smith         if (pc[j]!=0.0) {
5704c35f09e5SBarry Smith           flg = 1;
5705c35f09e5SBarry Smith           break;
5706c35f09e5SBarry Smith         }
5707c35f09e5SBarry Smith       }
57081a83e813SShri Abhyankar       if (flg) {
57091a83e813SShri Abhyankar         pv = b->a + bs2*bdiag[row];
571096b95a6bSBarry Smith         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
57111a83e813SShri Abhyankar         pj = b->j + bdiag[row+1]+1;         /* begining of U(row,:) */
57121a83e813SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
57131a83e813SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1;         /* num of entries inU(row,:), excluding diag */
57141a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
571596b95a6bSBarry Smith           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
57161a83e813SShri Abhyankar         }
57171a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
57181a83e813SShri Abhyankar       }
57191a83e813SShri Abhyankar     }
57201a83e813SShri Abhyankar 
57211a83e813SShri Abhyankar     /* finished row so stick it into b->a */
57221a83e813SShri Abhyankar     /* L part */
57231a83e813SShri Abhyankar     pv = b->a + bs2*bi[i];
57241a83e813SShri Abhyankar     pj = b->j + bi[i];
57251a83e813SShri Abhyankar     nz = bi[i+1] - bi[i];
57261a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57271a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57281a83e813SShri Abhyankar     }
57291a83e813SShri Abhyankar 
57301a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
57311a83e813SShri Abhyankar     pv = b->a + bs2*bdiag[i];
57321a83e813SShri Abhyankar     pj = b->j + bdiag[i];
5733e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
57341a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
573596b95a6bSBarry Smith     ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
57361a83e813SShri Abhyankar 
57371a83e813SShri Abhyankar     /* U part */
57381a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
57391a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
57401a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
57411a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57421a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57431a83e813SShri Abhyankar     }
57441a83e813SShri Abhyankar   }
57451a83e813SShri Abhyankar 
57461a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5747fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
57481a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
57491a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
57501a83e813SShri Abhyankar 
5751ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5752ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5753*26fbe8dcSKarl Rupp 
5754ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
5755ae3d28f0SHong Zhang   if (both_identity) {
57564dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5757ae3d28f0SHong Zhang   } else {
57584dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5759ae3d28f0SHong Zhang   }
57604dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5761ae3d28f0SHong Zhang 
57621a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5763*26fbe8dcSKarl Rupp 
5764766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
57651a83e813SShri Abhyankar   PetscFunctionReturn(0);
57661a83e813SShri Abhyankar }
57671a83e813SShri Abhyankar 
57686bce7ff8SHong Zhang /*
57696bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
57704dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
57714dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
57726bce7ff8SHong Zhang */
5773c0c7eb62SShri Abhyankar 
57746bce7ff8SHong Zhang #undef __FUNCT__
57754dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
57764dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
57776bce7ff8SHong Zhang {
57786bce7ff8SHong Zhang 
57796bce7ff8SHong Zhang   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
57806bce7ff8SHong Zhang   PetscErrorCode ierr;
578116a2bf60SHong Zhang   PetscInt       n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
578235aa4fcfSShri Abhyankar   PetscInt       i,j,nz,*bi,*bj,*bdiag,bi_temp;
578335aa4fcfSShri Abhyankar 
578435aa4fcfSShri Abhyankar   PetscFunctionBegin;
578535aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
578635aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
578735aa4fcfSShri Abhyankar 
578835aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
578935aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
579035aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5791*26fbe8dcSKarl Rupp 
579235aa4fcfSShri Abhyankar   b->singlemalloc    = PETSC_TRUE;
5793379be0ddSLisandro Dalcin   b->free_a          = PETSC_TRUE;
5794379be0ddSLisandro Dalcin   b->free_ij         = PETSC_TRUE;
57951e40a84eSLisandro Dalcin   fact->preallocated = PETSC_TRUE;
57961e40a84eSLisandro Dalcin   fact->assembled    = PETSC_TRUE;
579735aa4fcfSShri Abhyankar   if (!b->diag) {
579835aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
579935aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
580035aa4fcfSShri Abhyankar   }
580135aa4fcfSShri Abhyankar   bdiag = b->diag;
580235aa4fcfSShri Abhyankar 
580335aa4fcfSShri Abhyankar   if (n > 0) {
580435aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
580535aa4fcfSShri Abhyankar   }
580635aa4fcfSShri Abhyankar 
580735aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
580835aa4fcfSShri Abhyankar   bi = b->i;
580935aa4fcfSShri Abhyankar   bj = b->j;
581035aa4fcfSShri Abhyankar 
581135aa4fcfSShri Abhyankar   /* L part */
581235aa4fcfSShri Abhyankar   bi[0] = 0;
581335aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
581435aa4fcfSShri Abhyankar     nz      = adiag[i] - ai[i];
581535aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
581635aa4fcfSShri Abhyankar     aj      = a->j + ai[i];
581735aa4fcfSShri Abhyankar     for (j=0; j<nz; j++) {
581835aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
581935aa4fcfSShri Abhyankar     }
582035aa4fcfSShri Abhyankar   }
582135aa4fcfSShri Abhyankar 
582235aa4fcfSShri Abhyankar   /* U part */
582335aa4fcfSShri Abhyankar   bi_temp  = bi[n];
582435aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
582535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--) {
582635aa4fcfSShri Abhyankar     nz      = ai[i+1] - adiag[i] - 1;
582735aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
582835aa4fcfSShri Abhyankar     aj      = a->j + adiag[i] + 1;
582935aa4fcfSShri Abhyankar     for (j=0; j<nz; j++) {
583035aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
583135aa4fcfSShri Abhyankar     }
583235aa4fcfSShri Abhyankar     /* diag[i] */
583335aa4fcfSShri Abhyankar     *bj      = i; bj++;
583435aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
583535aa4fcfSShri Abhyankar   }
583635aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
583735aa4fcfSShri Abhyankar }
583835aa4fcfSShri Abhyankar 
583935aa4fcfSShri Abhyankar #undef __FUNCT__
58404dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58414dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
584216a2bf60SHong Zhang {
584316a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
584416a2bf60SHong Zhang   IS                 isicol;
584516a2bf60SHong Zhang   PetscErrorCode     ierr;
584616a2bf60SHong Zhang   const PetscInt     *r,*ic;
58477fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
584816a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
584916a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
585016a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
5851ace3abfcSBarry Smith   PetscBool          col_identity,row_identity,both_identity;
585216a2bf60SHong Zhang   PetscReal          f;
585316a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
585416a2bf60SHong Zhang   PetscBT            lnkbt;
585516a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
585616a2bf60SHong Zhang   PetscFreeSpaceList free_space    =PETSC_NULL,current_space=PETSC_NULL;
585716a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5858ace3abfcSBarry Smith   PetscBool          missing;
58597fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
586016a2bf60SHong Zhang 
586116a2bf60SHong Zhang   PetscFunctionBegin;
5862e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
58636ba06ab7SHong Zhang   if (bs>1) {  /* check shifttype */
58646ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
58656ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
58666ba06ab7SHong Zhang   }
58676ba06ab7SHong Zhang 
586816a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5869e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
587016a2bf60SHong Zhang 
587116a2bf60SHong Zhang   f             = info->fill;
587216a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
587316a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
5874*26fbe8dcSKarl Rupp 
587516a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
587616a2bf60SHong Zhang 
587716a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
587816a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5879*26fbe8dcSKarl Rupp 
5880ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
588116a2bf60SHong Zhang 
58827fa3a6a0SHong Zhang   if (!levels && both_identity) {
588316a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
58844dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
58854dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
588635aa4fcfSShri Abhyankar 
5887d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
588835aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
588935aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
589035aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
5891*26fbe8dcSKarl Rupp 
589235aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
589335aa4fcfSShri Abhyankar     b->row           = isrow;
589435aa4fcfSShri Abhyankar     b->col           = iscol;
589535aa4fcfSShri Abhyankar     b->icol          = isicol;
589635aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
589735aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
589835aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5899*26fbe8dcSKarl Rupp 
590035aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
590135aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
590235aa4fcfSShri Abhyankar   }
590335aa4fcfSShri Abhyankar 
590435aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
590535aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
590635aa4fcfSShri Abhyankar 
590735aa4fcfSShri Abhyankar   /* get new row pointers */
590835aa4fcfSShri Abhyankar   ierr  = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
590935aa4fcfSShri Abhyankar   bi[0] = 0;
591035aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
591135aa4fcfSShri Abhyankar   ierr     = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
591235aa4fcfSShri Abhyankar   bdiag[0] = 0;
591335aa4fcfSShri Abhyankar 
5914fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
591535aa4fcfSShri Abhyankar 
591635aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
591735aa4fcfSShri Abhyankar   nlnk = n + 1;
591835aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
591935aa4fcfSShri Abhyankar 
592035aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
592135aa4fcfSShri Abhyankar   ierr              = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
592235aa4fcfSShri Abhyankar   current_space     = free_space;
592335aa4fcfSShri Abhyankar   ierr              = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
592435aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
592535aa4fcfSShri Abhyankar 
592635aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
592735aa4fcfSShri Abhyankar     nzi = 0;
592835aa4fcfSShri Abhyankar     /* copy current row into linked list */
592935aa4fcfSShri Abhyankar     nnz = ai[r[i]+1] - ai[r[i]];
5930e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
593135aa4fcfSShri Abhyankar     cols   = aj + ai[r[i]];
593235aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
593335aa4fcfSShri Abhyankar     ierr   = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
593435aa4fcfSShri Abhyankar     nzi   += nlnk;
593535aa4fcfSShri Abhyankar 
593635aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
593735aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
593835aa4fcfSShri Abhyankar       fm = n;
593935aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
594035aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
594135aa4fcfSShri Abhyankar       lnk[fm]    = i;
594235aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
594335aa4fcfSShri Abhyankar       nzi++; dcount++;
594435aa4fcfSShri Abhyankar     }
594535aa4fcfSShri Abhyankar 
594635aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
594735aa4fcfSShri Abhyankar     nzbd = 0;
594835aa4fcfSShri Abhyankar     prow = lnk[n];
594935aa4fcfSShri Abhyankar     while (prow < i) {
595035aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
595135aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
595235aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
595335aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5954*26fbe8dcSKarl Rupp 
595535aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
595635aa4fcfSShri Abhyankar       nzi += nlnk;
595735aa4fcfSShri Abhyankar       prow = lnk[prow];
595835aa4fcfSShri Abhyankar       nzbd++;
595935aa4fcfSShri Abhyankar     }
596035aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
596135aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
596235aa4fcfSShri Abhyankar 
596335aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
596435aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
596535aa4fcfSShri Abhyankar       nnz  = 2*nzi*(n - i); /* estimated and max additional space needed */
596635aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
596735aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
596835aa4fcfSShri Abhyankar       reallocs++;
596935aa4fcfSShri Abhyankar     }
597035aa4fcfSShri Abhyankar 
597135aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
597235aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5973*26fbe8dcSKarl Rupp 
597435aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
597535aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
597635aa4fcfSShri Abhyankar 
597735aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
597865e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
597935aa4fcfSShri Abhyankar 
598035aa4fcfSShri Abhyankar     current_space->array           += nzi;
598135aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
598235aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
5983*26fbe8dcSKarl Rupp 
598435aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
598535aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
598635aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
598735aa4fcfSShri Abhyankar   }
598835aa4fcfSShri Abhyankar 
598935aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
599035aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
599135aa4fcfSShri Abhyankar 
599235aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
59939263d837SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
59942ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
599535aa4fcfSShri Abhyankar 
599635aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
599735aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5998fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
599935aa4fcfSShri Abhyankar 
600035aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
600135aa4fcfSShri Abhyankar   {
6002aef85c9fSShri Abhyankar     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
600335aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
600435aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
600535aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
600635aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
600735aa4fcfSShri Abhyankar     if (diagonal_fill) {
600835aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
600935aa4fcfSShri Abhyankar     }
601035aa4fcfSShri Abhyankar   }
601135aa4fcfSShri Abhyankar #endif
601235aa4fcfSShri Abhyankar 
601335aa4fcfSShri Abhyankar   /* put together the new matrix */
601435aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
601535aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6016*26fbe8dcSKarl Rupp 
601735aa4fcfSShri Abhyankar   b               = (Mat_SeqBAIJ*)(fact)->data;
601835aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
601935aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
602035aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
6021*26fbe8dcSKarl Rupp 
602235aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6023*26fbe8dcSKarl Rupp 
602435aa4fcfSShri Abhyankar   b->j          = bj;
602535aa4fcfSShri Abhyankar   b->i          = bi;
602635aa4fcfSShri Abhyankar   b->diag       = bdiag;
602735aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
602835aa4fcfSShri Abhyankar   b->ilen       = 0;
602935aa4fcfSShri Abhyankar   b->imax       = 0;
603035aa4fcfSShri Abhyankar   b->row        = isrow;
603135aa4fcfSShri Abhyankar   b->col        = iscol;
603235aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
603335aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
603435aa4fcfSShri Abhyankar   b->icol       = isicol;
6035*26fbe8dcSKarl Rupp 
603635aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
603735aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
603835aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
603935aa4fcfSShri Abhyankar   ierr     = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
604035aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
6041*26fbe8dcSKarl Rupp 
6042ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
6043ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6044ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6045*26fbe8dcSKarl Rupp 
60464dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
604735aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
604835aa4fcfSShri Abhyankar }
604935aa4fcfSShri Abhyankar 
60504e2b4712SSatish Balay /*
60514e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
60524e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
60534e2b4712SSatish Balay    Not a good example of code reuse.
60544e2b4712SSatish Balay */
60554a2ae208SSatish Balay #undef __FUNCT__
605606e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
605706e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
60584e2b4712SSatish Balay {
60594e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
60604e2b4712SSatish Balay   IS             isicol;
60616849ba73SBarry Smith   PetscErrorCode ierr;
60625d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
60635d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6064a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6065d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6066ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity,flg;
6067329f5518SBarry Smith   PetscReal      f;
60684e2b4712SSatish Balay 
60694e2b4712SSatish Balay   PetscFunctionBegin;
60706bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6071e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
60726bce7ff8SHong Zhang 
6073435faa5fSBarry Smith   f             = info->fill;
6074690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
6075690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
6076*26fbe8dcSKarl Rupp 
60774c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
607816a2bf60SHong Zhang 
6079667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6080667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6081ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
6082309c388cSBarry Smith 
608341df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
608416a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
60858b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
60866bce7ff8SHong Zhang 
6087d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
6088ae3d28f0SHong Zhang     b                = (Mat_SeqBAIJ*)fact->data;
6089bb3d539aSBarry Smith     b->row           = isrow;
6090bb3d539aSBarry Smith     b->col           = iscol;
6091bb3d539aSBarry Smith     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6092bb3d539aSBarry Smith     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6093bb3d539aSBarry Smith     b->icol          = isicol;
6094bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6095*26fbe8dcSKarl Rupp 
6096b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
60976bce7ff8SHong Zhang     PetscFunctionReturn(0);
60986bce7ff8SHong Zhang   }
60996bce7ff8SHong Zhang 
61006bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
61014e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
61024e2b4712SSatish Balay   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
61034e2b4712SSatish Balay 
61044e2b4712SSatish Balay   /* get new row pointers */
6105690b6cddSBarry Smith   ierr     = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
61064e2b4712SSatish Balay   ainew[0] = 0;
61074e2b4712SSatish Balay   /* don't know how many column pointers are needed so estimate */
6108690b6cddSBarry Smith   jmax = (PetscInt)(f*ai[n] + 1);
6109690b6cddSBarry Smith   ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
61104e2b4712SSatish Balay   /* ajfill is level of fill for each fill entry */
6111690b6cddSBarry Smith   ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
61124e2b4712SSatish Balay   /* fill is a linked list of nonzeros in active row */
6113690b6cddSBarry Smith   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
61144e2b4712SSatish Balay   /* im is level for each filled value */
6115690b6cddSBarry Smith   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
61164e2b4712SSatish Balay   /* dloc is location of diagonal in factor */
6117690b6cddSBarry Smith   ierr    = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
61184e2b4712SSatish Balay   dloc[0] = 0;
61194e2b4712SSatish Balay   for (prow=0; prow<n; prow++) {
6120435faa5fSBarry Smith 
6121435faa5fSBarry Smith     /* copy prow into linked list */
61224e2b4712SSatish Balay     nzf = nz = ai[r[prow]+1] - ai[r[prow]];
6123e32f2f54SBarry Smith     if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
61244e2b4712SSatish Balay     xi         = aj + ai[r[prow]];
61254e2b4712SSatish Balay     fill[n]    = n;
6126435faa5fSBarry Smith     fill[prow] = -1;   /* marker for diagonal entry */
61274e2b4712SSatish Balay     while (nz--) {
61284e2b4712SSatish Balay       fm  = n;
61294e2b4712SSatish Balay       idx = ic[*xi++];
61304e2b4712SSatish Balay       do {
61314e2b4712SSatish Balay         m  = fm;
61324e2b4712SSatish Balay         fm = fill[m];
61334e2b4712SSatish Balay       } while (fm < idx);
61344e2b4712SSatish Balay       fill[m]   = idx;
61354e2b4712SSatish Balay       fill[idx] = fm;
61364e2b4712SSatish Balay       im[idx]   = 0;
61374e2b4712SSatish Balay     }
6138435faa5fSBarry Smith 
6139435faa5fSBarry Smith     /* make sure diagonal entry is included */
6140435faa5fSBarry Smith     if (diagonal_fill && fill[prow] == -1) {
6141435faa5fSBarry Smith       fm = n;
6142435faa5fSBarry Smith       while (fill[fm] < prow) fm = fill[fm];
6143435faa5fSBarry Smith       fill[prow] = fill[fm];    /* insert diagonal into linked list */
6144435faa5fSBarry Smith       fill[fm]   = prow;
6145435faa5fSBarry Smith       im[prow]   = 0;
6146435faa5fSBarry Smith       nzf++;
6147335d9088SBarry Smith       dcount++;
6148435faa5fSBarry Smith     }
6149435faa5fSBarry Smith 
61504e2b4712SSatish Balay     nzi = 0;
61514e2b4712SSatish Balay     row = fill[n];
61524e2b4712SSatish Balay     while (row < prow) {
61534e2b4712SSatish Balay       incrlev = im[row] + 1;
61544e2b4712SSatish Balay       nz      = dloc[row];
6155435faa5fSBarry Smith       xi      = ajnew  + ainew[row] + nz + 1;
61564e2b4712SSatish Balay       flev    = ajfill + ainew[row] + nz + 1;
61574e2b4712SSatish Balay       nnz     = ainew[row+1] - ainew[row] - nz - 1;
61584e2b4712SSatish Balay       fm      = row;
61594e2b4712SSatish Balay       while (nnz-- > 0) {
61604e2b4712SSatish Balay         idx = *xi++;
61614e2b4712SSatish Balay         if (*flev + incrlev > levels) {
61624e2b4712SSatish Balay           flev++;
61634e2b4712SSatish Balay           continue;
61644e2b4712SSatish Balay         }
61654e2b4712SSatish Balay         do {
61664e2b4712SSatish Balay           m  = fm;
61674e2b4712SSatish Balay           fm = fill[m];
61684e2b4712SSatish Balay         } while (fm < idx);
61694e2b4712SSatish Balay         if (fm != idx) {
61704e2b4712SSatish Balay           im[idx]   = *flev + incrlev;
61714e2b4712SSatish Balay           fill[m]   = idx;
61724e2b4712SSatish Balay           fill[idx] = fm;
61734e2b4712SSatish Balay           fm        = idx;
61744e2b4712SSatish Balay           nzf++;
6175*26fbe8dcSKarl Rupp         } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
61764e2b4712SSatish Balay         flev++;
61774e2b4712SSatish Balay       }
61784e2b4712SSatish Balay       row = fill[row];
61794e2b4712SSatish Balay       nzi++;
61804e2b4712SSatish Balay     }
61814e2b4712SSatish Balay     /* copy new filled row into permanent storage */
61824e2b4712SSatish Balay     ainew[prow+1] = ainew[prow] + nzf;
61834e2b4712SSatish Balay     if (ainew[prow+1] > jmax) {
6184ecf371e4SBarry Smith 
6185ecf371e4SBarry Smith       /* estimate how much additional space we will need */
6186ecf371e4SBarry Smith       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6187ecf371e4SBarry Smith       /* just double the memory each time */
6188690b6cddSBarry Smith       PetscInt maxadd = jmax;
6189ecf371e4SBarry Smith       /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
61904e2b4712SSatish Balay       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
61914e2b4712SSatish Balay       jmax += maxadd;
6192ecf371e4SBarry Smith 
6193ecf371e4SBarry Smith       /* allocate a longer ajnew and ajfill */
61945d0c19d7SBarry Smith       ierr   = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61955d0c19d7SBarry Smith       ierr   = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6196606d414cSSatish Balay       ierr   = PetscFree(ajnew);CHKERRQ(ierr);
61975d0c19d7SBarry Smith       ajnew  = xitmp;
61985d0c19d7SBarry Smith       ierr   = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61995d0c19d7SBarry Smith       ierr   = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6200606d414cSSatish Balay       ierr   = PetscFree(ajfill);CHKERRQ(ierr);
62015d0c19d7SBarry Smith       ajfill = xitmp;
6202eb150c5cSKris Buschelman       reallocate++;   /* count how many reallocations are needed */
62034e2b4712SSatish Balay     }
62045d0c19d7SBarry Smith     xitmp      = ajnew + ainew[prow];
62054e2b4712SSatish Balay     flev       = ajfill + ainew[prow];
62064e2b4712SSatish Balay     dloc[prow] = nzi;
62074e2b4712SSatish Balay     fm         = fill[n];
62084e2b4712SSatish Balay     while (nzf--) {
62095d0c19d7SBarry Smith       *xitmp++ = fm;
62104e2b4712SSatish Balay       *flev++  = im[fm];
62114e2b4712SSatish Balay       fm       = fill[fm];
62124e2b4712SSatish Balay     }
6213435faa5fSBarry Smith     /* make sure row has diagonal entry */
6214f23aa3ddSBarry Smith     if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
62152401956bSBarry Smith                                                         try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6216435faa5fSBarry Smith   }
6217606d414cSSatish Balay   ierr = PetscFree(ajfill);CHKERRQ(ierr);
62184e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
62194e2b4712SSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6220606d414cSSatish Balay   ierr = PetscFree(fill);CHKERRQ(ierr);
6221606d414cSSatish Balay   ierr = PetscFree(im);CHKERRQ(ierr);
62224e2b4712SSatish Balay 
62236cf91177SBarry Smith #if defined(PETSC_USE_INFO)
62244e2b4712SSatish Balay   {
6225329f5518SBarry Smith     PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6226ae15b995SBarry Smith     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6227ae15b995SBarry Smith     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6228ae15b995SBarry Smith     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6229ae15b995SBarry Smith     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6230335d9088SBarry Smith     if (diagonal_fill) {
6231ae15b995SBarry Smith       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6232335d9088SBarry Smith     }
62334e2b4712SSatish Balay   }
623463ba0a88SBarry Smith #endif
62354e2b4712SSatish Balay 
62364e2b4712SSatish Balay   /* put together the new matrix */
6237719d5645SBarry Smith   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6238719d5645SBarry Smith   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6239ae3d28f0SHong Zhang   b    = (Mat_SeqBAIJ*)fact->data;
6240*26fbe8dcSKarl Rupp 
6241e6b907acSBarry Smith   b->free_a       = PETSC_TRUE;
6242e6b907acSBarry Smith   b->free_ij      = PETSC_TRUE;
62437c922b88SBarry Smith   b->singlemalloc = PETSC_FALSE;
6244*26fbe8dcSKarl Rupp 
6245a96a251dSBarry Smith   ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6246*26fbe8dcSKarl Rupp 
62474e2b4712SSatish Balay   b->j          = ajnew;
62484e2b4712SSatish Balay   b->i          = ainew;
62494e2b4712SSatish Balay   for (i=0; i<n; i++) dloc[i] += ainew[i];
62504e2b4712SSatish Balay   b->diag          = dloc;
62517f53bb6cSHong Zhang   b->free_diag     = PETSC_TRUE;
62524e2b4712SSatish Balay   b->ilen          = 0;
62534e2b4712SSatish Balay   b->imax          = 0;
62544e2b4712SSatish Balay   b->row           = isrow;
62554e2b4712SSatish Balay   b->col           = iscol;
6256bcd9e38bSBarry Smith   b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6257*26fbe8dcSKarl Rupp 
6258c38d4ed2SBarry Smith   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6259c38d4ed2SBarry Smith   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6260e51c0b9cSSatish Balay   b->icol = isicol;
626187828ca2SBarry Smith   ierr    = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
62624e2b4712SSatish Balay   /* In b structure:  Free imax, ilen, old a, old j.
62634e2b4712SSatish Balay      Allocate dloc, solve_work, new a, new j */
6264719d5645SBarry Smith   ierr     = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
62654e2b4712SSatish Balay   b->maxnz = b->nz = ainew[n];
62664e2b4712SSatish Balay 
6267ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocate;
6268ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6269ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
62706bce7ff8SHong Zhang 
62718b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
62728661488fSKris Buschelman   PetscFunctionReturn(0);
62738661488fSKris Buschelman }
62748661488fSKris Buschelman 
6275732ee342SKris Buschelman #undef __FUNCT__
62767e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6277dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
62787e7071cdSKris Buschelman {
627912272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
628012272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
62815fd66863SKarl Rupp 
62825a9542e3SKris Buschelman   PetscFunctionBegin;
62837cf1b8d3SKris Buschelman   /* Undo Column scaling */
62847cf1b8d3SKris Buschelman   /*    while (nz--) { */
62857cf1b8d3SKris Buschelman   /*      AJ[i] = AJ[i]/4; */
62867cf1b8d3SKris Buschelman   /*    } */
6287c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6288c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62897cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
62907cf1b8d3SKris Buschelman }
62917cf1b8d3SKris Buschelman 
62927cf1b8d3SKris Buschelman #undef __FUNCT__
62937cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6294dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
62957cf1b8d3SKris Buschelman {
62967cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
6297b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
62982aa5897fSKris Buschelman   unsigned short *aj=(unsigned short*)AJ;
62995fd66863SKarl Rupp 
63005a9542e3SKris Buschelman   PetscFunctionBegin;
63010b9da03eSKris Buschelman   /* Is this really necessary? */
630220235379SKris Buschelman   while (nz--) {
63030b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
63047e7071cdSKris Buschelman   }
6305c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
63067e7071cdSKris Buschelman   PetscFunctionReturn(0);
63077e7071cdSKris Buschelman }
63087e7071cdSKris Buschelman 
6309732ee342SKris Buschelman 
6310