xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6e111a19f6677190c8cb13236301fcb65e0e3d3b)
1be1d678aSKris Buschelman 
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
7c6db04a5SJed Brown #include <../src/mat/blockinvert.h>
8c6db04a5SJed Brown #include <petscbt.h>
9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1493fd935bSShri Abhyankar {
1593fd935bSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1693fd935bSShri Abhyankar   PetscErrorCode    ierr;
1793fd935bSShri Abhyankar   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
1893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
1993fd935bSShri Abhyankar   PetscInt          nz;
2093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
2193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
2293fd935bSShri Abhyankar   const PetscScalar *b;
2393fd935bSShri Abhyankar 
2493fd935bSShri Abhyankar   PetscFunctionBegin;
253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2793fd935bSShri Abhyankar   tmp  = a->solve_work;
2893fd935bSShri Abhyankar 
2993fd935bSShri Abhyankar 
3093fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
3193fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[i];
3293fd935bSShri Abhyankar 
3393fd935bSShri Abhyankar   /* forward solve the U^T */
3493fd935bSShri Abhyankar   for (i=0; i<n; i++) {
3593fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
3693fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
3793fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
3893fd935bSShri Abhyankar     s1  = tmp[i];
3993fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
4093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
4193fd935bSShri Abhyankar     tmp[i] = s1;
4293fd935bSShri Abhyankar   }
4393fd935bSShri Abhyankar 
4493fd935bSShri Abhyankar   /* backward solve the L^T */
4593fd935bSShri Abhyankar   for (i=n-1; i>=0; i--) {
4693fd935bSShri Abhyankar     v   = aa + ai[i];
4793fd935bSShri Abhyankar     vi  = aj + ai[i];
4893fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
4993fd935bSShri Abhyankar     s1  = tmp[i];
5093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
5193fd935bSShri Abhyankar   }
5293fd935bSShri Abhyankar 
5393fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
5493fd935bSShri Abhyankar   for (i=0; i<n; i++) x[i] = tmp[i];
5593fd935bSShri Abhyankar 
563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5793fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5893fd935bSShri Abhyankar 
5993fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
6093fd935bSShri Abhyankar   PetscFunctionReturn(0);
6193fd935bSShri Abhyankar }
6293fd935bSShri Abhyankar 
6393fd935bSShri Abhyankar #undef __FUNCT__
6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66f1af5d2fSBarry Smith {
67f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68dfbe8321SBarry Smith   PetscErrorCode    ierr;
690b68f018SBarry Smith   PetscInt          i,nz;
700b68f018SBarry Smith   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
710b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
720b68f018SBarry Smith   PetscScalar       s1,*x;
73f1af5d2fSBarry Smith 
74f1af5d2fSBarry Smith   PetscFunctionBegin;
75ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77f1af5d2fSBarry Smith 
78f1af5d2fSBarry Smith   /* forward solve the U^T */
79f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith     v     = aa + diag[i];
82f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
83ef66eb69SBarry Smith     s1    = (*v++)*x[i];
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
88f1af5d2fSBarry Smith     }
89f1af5d2fSBarry Smith     x[i]   = s1;
90f1af5d2fSBarry Smith   }
91f1af5d2fSBarry Smith   /* backward solve the L^T */
92f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
93f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
94f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
95f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
96f1af5d2fSBarry Smith     s1   = x[i];
97f1af5d2fSBarry Smith     while (nz--) {
98f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
99f1af5d2fSBarry Smith     }
100f1af5d2fSBarry Smith   }
1011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103f1af5d2fSBarry Smith   PetscFunctionReturn(0);
104f1af5d2fSBarry Smith }
105f1af5d2fSBarry Smith 
1064a2ae208SSatish Balay #undef __FUNCT__
10706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
10806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109f1af5d2fSBarry Smith {
110f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
111dfbe8321SBarry Smith   PetscErrorCode    ierr;
112b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
113b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
115b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
116f1af5d2fSBarry Smith 
117f1af5d2fSBarry Smith   PetscFunctionBegin;
118ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120f1af5d2fSBarry Smith 
121f1af5d2fSBarry Smith   /* forward solve the U^T */
122f1af5d2fSBarry Smith   idx = 0;
123f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
124f1af5d2fSBarry Smith 
125f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
126f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
127ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
128f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
129f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
130f1af5d2fSBarry Smith     v += 4;
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
133f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
134f1af5d2fSBarry Smith     while (nz--) {
135f1af5d2fSBarry Smith       oidx = 2*(*vi++);
136f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138f1af5d2fSBarry Smith       v  += 4;
139f1af5d2fSBarry Smith     }
140f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
141f1af5d2fSBarry Smith     idx += 2;
142f1af5d2fSBarry Smith   }
143f1af5d2fSBarry Smith   /* backward solve the L^T */
144f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
145f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
146f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
147f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
148f1af5d2fSBarry Smith     idt  = 2*i;
149f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
150f1af5d2fSBarry Smith     while (nz--) {
151f1af5d2fSBarry Smith       idx   = 2*(*vi--);
152f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154f1af5d2fSBarry Smith       v -= 4;
155f1af5d2fSBarry Smith     }
156f1af5d2fSBarry Smith   }
1571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159f1af5d2fSBarry Smith   PetscFunctionReturn(0);
160f1af5d2fSBarry Smith }
161f1af5d2fSBarry Smith 
1624a2ae208SSatish Balay #undef __FUNCT__
1634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1656929473cSShri Abhyankar {
1666929473cSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1676929473cSShri Abhyankar   PetscErrorCode    ierr;
168b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1696929473cSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
170b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
171b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
172b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
1736929473cSShri Abhyankar 
1746929473cSShri Abhyankar   PetscFunctionBegin;
1756929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1766929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1776929473cSShri Abhyankar 
1786929473cSShri Abhyankar   /* forward solve the U^T */
1796929473cSShri Abhyankar   idx = 0;
1806929473cSShri Abhyankar   for (i=0; i<n; i++) {
1816929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1826929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1836929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1846929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1856929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1866929473cSShri Abhyankar     v -= bs2;
1876929473cSShri Abhyankar 
1886929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1896929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1906929473cSShri Abhyankar     for (j=0;j>-nz;j--) {
1916929473cSShri Abhyankar       oidx = bs*vi[j];
1926929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1936929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1946929473cSShri Abhyankar       v  -= bs2;
1956929473cSShri Abhyankar     }
1966929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
1976929473cSShri Abhyankar     idx += bs;
1986929473cSShri Abhyankar   }
1996929473cSShri Abhyankar   /* backward solve the L^T */
2006929473cSShri Abhyankar   for (i=n-1; i>=0; i--) {
2016929473cSShri Abhyankar     v    = aa + bs2*ai[i];
2026929473cSShri Abhyankar     vi   = aj + ai[i];
2036929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
2046929473cSShri Abhyankar     idt  = bs*i;
2056929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
2066929473cSShri Abhyankar     for (j=0;j<nz;j++) {
2076929473cSShri Abhyankar       idx   = bs*vi[j];
2086929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
2096929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
2106929473cSShri Abhyankar       v += bs2;
2116929473cSShri Abhyankar     }
2126929473cSShri Abhyankar   }
2136929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2146929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2156929473cSShri Abhyankar   PetscFunctionReturn(0);
2166929473cSShri Abhyankar }
2176929473cSShri Abhyankar 
2186929473cSShri Abhyankar #undef __FUNCT__
21906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
22006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221f1af5d2fSBarry Smith {
222f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
223dfbe8321SBarry Smith   PetscErrorCode    ierr;
224b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
226b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
227b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
228f1af5d2fSBarry Smith 
229f1af5d2fSBarry Smith   PetscFunctionBegin;
230ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232f1af5d2fSBarry Smith 
233f1af5d2fSBarry Smith   /* forward solve the U^T */
234f1af5d2fSBarry Smith   idx = 0;
235f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
236f1af5d2fSBarry Smith 
237f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
238f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
239ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243f1af5d2fSBarry Smith     v += 9;
244f1af5d2fSBarry Smith 
245f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
246f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
247f1af5d2fSBarry Smith     while (nz--) {
248f1af5d2fSBarry Smith       oidx = 3*(*vi++);
249f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252f1af5d2fSBarry Smith       v  += 9;
253f1af5d2fSBarry Smith     }
254f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
255f1af5d2fSBarry Smith     idx += 3;
256f1af5d2fSBarry Smith   }
257f1af5d2fSBarry Smith   /* backward solve the L^T */
258f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
259f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
260f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
261f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
262f1af5d2fSBarry Smith     idt  = 3*i;
263f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264f1af5d2fSBarry Smith     while (nz--) {
265f1af5d2fSBarry Smith       idx   = 3*(*vi--);
266f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269f1af5d2fSBarry Smith       v -= 9;
270f1af5d2fSBarry Smith     }
271f1af5d2fSBarry Smith   }
2721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274f1af5d2fSBarry Smith   PetscFunctionReturn(0);
275f1af5d2fSBarry Smith }
276f1af5d2fSBarry Smith 
2774a2ae208SSatish Balay #undef __FUNCT__
2784dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2794dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2808499736aSShri Abhyankar {
2818499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2828499736aSShri Abhyankar   PetscErrorCode    ierr;
283b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2848499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
285b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
286b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
287b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
2888499736aSShri Abhyankar 
2898499736aSShri Abhyankar   PetscFunctionBegin;
2908499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2918499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2928499736aSShri Abhyankar 
2938499736aSShri Abhyankar   /* forward solve the U^T */
2948499736aSShri Abhyankar   idx = 0;
2958499736aSShri Abhyankar   for (i=0; i<n; i++) {
2968499736aSShri Abhyankar     v     = aa + bs2*diag[i];
2978499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
2988499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
2998499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
3008499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
3018499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
3028499736aSShri Abhyankar     v -= bs2;
3038499736aSShri Abhyankar 
3048499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3058499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3068499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
3078499736aSShri Abhyankar       oidx = bs*vi[j];
3088499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3098499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3108499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3118499736aSShri Abhyankar       v  -= bs2;
3128499736aSShri Abhyankar     }
3138499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
3148499736aSShri Abhyankar     idx += bs;
3158499736aSShri Abhyankar   }
3168499736aSShri Abhyankar   /* backward solve the L^T */
3178499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
3188499736aSShri Abhyankar     v    = aa + bs2*ai[i];
3198499736aSShri Abhyankar     vi   = aj + ai[i];
3208499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
3218499736aSShri Abhyankar     idt  = bs*i;
3228499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
3238499736aSShri Abhyankar     for (j=0;j<nz;j++) {
3248499736aSShri Abhyankar       idx   = bs*vi[j];
3258499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3268499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3278499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3288499736aSShri Abhyankar       v += bs2;
3298499736aSShri Abhyankar     }
3308499736aSShri Abhyankar   }
3318499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3328499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3338499736aSShri Abhyankar   PetscFunctionReturn(0);
3348499736aSShri Abhyankar }
3358499736aSShri Abhyankar 
3368499736aSShri Abhyankar #undef __FUNCT__
33706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
33806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339f1af5d2fSBarry Smith {
340f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
341dfbe8321SBarry Smith   PetscErrorCode    ierr;
342b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
344b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
345b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
346f1af5d2fSBarry Smith 
347f1af5d2fSBarry Smith   PetscFunctionBegin;
348ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350f1af5d2fSBarry Smith 
351f1af5d2fSBarry Smith   /* forward solve the U^T */
352f1af5d2fSBarry Smith   idx = 0;
353f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
354f1af5d2fSBarry Smith 
355f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
356f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
357ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362f1af5d2fSBarry Smith     v += 16;
363f1af5d2fSBarry Smith 
364f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
365f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
366f1af5d2fSBarry Smith     while (nz--) {
367f1af5d2fSBarry Smith       oidx = 4*(*vi++);
368f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372f1af5d2fSBarry Smith       v  += 16;
373f1af5d2fSBarry Smith     }
374f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375f1af5d2fSBarry Smith     idx += 4;
376f1af5d2fSBarry Smith   }
377f1af5d2fSBarry Smith   /* backward solve the L^T */
378f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
379f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
380f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
381f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
382f1af5d2fSBarry Smith     idt  = 4*i;
383f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384f1af5d2fSBarry Smith     while (nz--) {
385f1af5d2fSBarry Smith       idx   = 4*(*vi--);
386f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390f1af5d2fSBarry Smith       v -= 16;
391f1af5d2fSBarry Smith     }
392f1af5d2fSBarry Smith   }
3931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395f1af5d2fSBarry Smith   PetscFunctionReturn(0);
396f1af5d2fSBarry Smith }
397f1af5d2fSBarry Smith 
3984a2ae208SSatish Balay #undef __FUNCT__
3994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
4004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4018499736aSShri Abhyankar {
4028499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4038499736aSShri Abhyankar   PetscErrorCode    ierr;
404b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
4058499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
406b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
407b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
408b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
4098499736aSShri Abhyankar 
4108499736aSShri Abhyankar   PetscFunctionBegin;
4118499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4128499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4138499736aSShri Abhyankar 
4148499736aSShri Abhyankar   /* forward solve the U^T */
4158499736aSShri Abhyankar   idx = 0;
4168499736aSShri Abhyankar   for (i=0; i<n; i++) {
4178499736aSShri Abhyankar     v     = aa + bs2*diag[i];
4188499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
4198499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
4208499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
4218499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
4228499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
4238499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
4248499736aSShri Abhyankar     v -= bs2;
4258499736aSShri Abhyankar 
4268499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
4278499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
4288499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
4298499736aSShri Abhyankar       oidx = bs*vi[j];
4308499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4318499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4328499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4338499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4348499736aSShri Abhyankar       v  -= bs2;
4358499736aSShri Abhyankar     }
4368499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4378499736aSShri Abhyankar     idx += bs;
4388499736aSShri Abhyankar   }
4398499736aSShri Abhyankar   /* backward solve the L^T */
4408499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
4418499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4428499736aSShri Abhyankar     vi   = aj + ai[i];
4438499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4448499736aSShri Abhyankar     idt  = bs*i;
4458499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4468499736aSShri Abhyankar     for (j=0;j<nz;j++) {
4478499736aSShri Abhyankar       idx   = bs*vi[j];
4488499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4498499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4508499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4518499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4528499736aSShri Abhyankar       v += bs2;
4538499736aSShri Abhyankar     }
4548499736aSShri Abhyankar   }
4558499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4578499736aSShri Abhyankar   PetscFunctionReturn(0);
4588499736aSShri Abhyankar }
4598499736aSShri Abhyankar 
4608499736aSShri Abhyankar #undef __FUNCT__
46106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
46206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463f1af5d2fSBarry Smith {
464f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
465dfbe8321SBarry Smith   PetscErrorCode    ierr;
466b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
468b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
469b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   PetscFunctionBegin;
472ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   /* forward solve the U^T */
476f1af5d2fSBarry Smith   idx = 0;
477f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
480f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
481ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487f1af5d2fSBarry Smith     v += 25;
488f1af5d2fSBarry Smith 
489f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
490f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
491f1af5d2fSBarry Smith     while (nz--) {
492f1af5d2fSBarry Smith       oidx = 5*(*vi++);
493f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498f1af5d2fSBarry Smith       v  += 25;
499f1af5d2fSBarry Smith     }
500f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501f1af5d2fSBarry Smith     idx += 5;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
505f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     idt  = 5*i;
509f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510f1af5d2fSBarry Smith     while (nz--) {
511f1af5d2fSBarry Smith       idx   = 5*(*vi--);
512f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517f1af5d2fSBarry Smith       v -= 25;
518f1af5d2fSBarry Smith     }
519f1af5d2fSBarry Smith   }
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5288499736aSShri Abhyankar {
5298499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5308499736aSShri Abhyankar   PetscErrorCode ierr;
531b3260449SShri Abhyankar   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5328499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
533b3260449SShri Abhyankar   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
534b3260449SShri Abhyankar   const MatScalar      *aa=a->a,*v;
535b3260449SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
5368499736aSShri Abhyankar 
5378499736aSShri Abhyankar   PetscFunctionBegin;
5388499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5398499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5408499736aSShri Abhyankar 
5418499736aSShri Abhyankar   /* forward solve the U^T */
5428499736aSShri Abhyankar   idx = 0;
5438499736aSShri Abhyankar   for (i=0; i<n; i++) {
5448499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5458499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5468499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5478499736aSShri Abhyankar     x5 = x[4+idx];
5488499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5498499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5508499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5518499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5528499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5538499736aSShri Abhyankar     v -= bs2;
5548499736aSShri Abhyankar 
5558499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5568499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5578499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
5588499736aSShri Abhyankar       oidx = bs*vi[j];
5598499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5608499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5618499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5628499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5638499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5648499736aSShri Abhyankar       v  -= bs2;
5658499736aSShri Abhyankar     }
5668499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5678499736aSShri Abhyankar     idx += bs;
5688499736aSShri Abhyankar   }
5698499736aSShri Abhyankar   /* backward solve the L^T */
5708499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
5718499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5728499736aSShri Abhyankar     vi   = aj + ai[i];
5738499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
5748499736aSShri Abhyankar     idt  = bs*i;
5758499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
5768499736aSShri Abhyankar     for (j=0;j<nz;j++) {
5778499736aSShri Abhyankar       idx   = bs*vi[j];
5788499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5798499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5808499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5818499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5828499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5838499736aSShri Abhyankar       v += bs2;
5848499736aSShri Abhyankar     }
5858499736aSShri Abhyankar   }
5868499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5878499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5888499736aSShri Abhyankar   PetscFunctionReturn(0);
5898499736aSShri Abhyankar }
5908499736aSShri Abhyankar 
5918499736aSShri Abhyankar #undef __FUNCT__
59206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
59306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594f1af5d2fSBarry Smith {
595f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
596dfbe8321SBarry Smith   PetscErrorCode    ierr;
597b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
599b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
600b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   PetscFunctionBegin;
603ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605f1af5d2fSBarry Smith 
606f1af5d2fSBarry Smith   /* forward solve the U^T */
607f1af5d2fSBarry Smith   idx = 0;
608f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
609f1af5d2fSBarry Smith 
610f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
611f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
612ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613ef66eb69SBarry Smith     x6    = x[5+idx];
614f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620f1af5d2fSBarry Smith     v += 36;
621f1af5d2fSBarry Smith 
622f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
623f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       oidx = 6*(*vi++);
626f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v  += 36;
633f1af5d2fSBarry Smith     }
634f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635f1af5d2fSBarry Smith     x[5+idx] = s6;
636f1af5d2fSBarry Smith     idx += 6;
637f1af5d2fSBarry Smith   }
638f1af5d2fSBarry Smith   /* backward solve the L^T */
639f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
640f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
641f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
642f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
643f1af5d2fSBarry Smith     idt  = 6*i;
644f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645f1af5d2fSBarry Smith     s6 = x[5+idt];
646f1af5d2fSBarry Smith     while (nz--) {
647f1af5d2fSBarry Smith       idx   = 6*(*vi--);
648f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654f1af5d2fSBarry Smith       v -= 36;
655f1af5d2fSBarry Smith     }
656f1af5d2fSBarry Smith   }
6571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659f1af5d2fSBarry Smith   PetscFunctionReturn(0);
660f1af5d2fSBarry Smith }
661f1af5d2fSBarry Smith 
6624a2ae208SSatish Balay #undef __FUNCT__
6634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6658499736aSShri Abhyankar {
6668499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
6678499736aSShri Abhyankar   PetscErrorCode    ierr;
668b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6698499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
670b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
671b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
672b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
6738499736aSShri Abhyankar 
6748499736aSShri Abhyankar   PetscFunctionBegin;
6758499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6768499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
6778499736aSShri Abhyankar 
6788499736aSShri Abhyankar   /* forward solve the U^T */
6798499736aSShri Abhyankar   idx = 0;
6808499736aSShri Abhyankar   for (i=0; i<n; i++) {
6818499736aSShri Abhyankar     v     = aa + bs2*diag[i];
6828499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
6838499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
6848499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
6858499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
6868499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
6878499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
6888499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
6898499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
6908499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
6918499736aSShri Abhyankar     v -= bs2;
6928499736aSShri Abhyankar 
6938499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
6948499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
6958499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
6968499736aSShri Abhyankar       oidx = bs*vi[j];
6978499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
6988499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
6998499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7008499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7018499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7028499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7038499736aSShri Abhyankar       v  -= bs2;
7048499736aSShri Abhyankar     }
7058499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
7068499736aSShri Abhyankar     x[5+idx] = s6;
7078499736aSShri Abhyankar     idx += bs;
7088499736aSShri Abhyankar   }
7098499736aSShri Abhyankar   /* backward solve the L^T */
7108499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
7118499736aSShri Abhyankar     v    = aa + bs2*ai[i];
7128499736aSShri Abhyankar     vi   = aj + ai[i];
7138499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
7148499736aSShri Abhyankar     idt  = bs*i;
7158499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
7168499736aSShri Abhyankar     s6   = x[5+idt];
7178499736aSShri Abhyankar     for (j=0;j<nz;j++) {
7188499736aSShri Abhyankar       idx   = bs*vi[j];
7198499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7208499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7218499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7228499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7238499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7248499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7258499736aSShri Abhyankar       v += bs2;
7268499736aSShri Abhyankar     }
7278499736aSShri Abhyankar   }
7288499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7298499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7308499736aSShri Abhyankar   PetscFunctionReturn(0);
7318499736aSShri Abhyankar }
7328499736aSShri Abhyankar 
7338499736aSShri Abhyankar #undef __FUNCT__
73406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
73506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736f1af5d2fSBarry Smith {
737f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
738dfbe8321SBarry Smith   PetscErrorCode    ierr;
739b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
741b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
742b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743f1af5d2fSBarry Smith 
744f1af5d2fSBarry Smith   PetscFunctionBegin;
745ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7461ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith   /* forward solve the U^T */
749f1af5d2fSBarry Smith   idx = 0;
750f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
751f1af5d2fSBarry Smith 
752f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
753f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
754ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
756f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763f1af5d2fSBarry Smith     v += 49;
764f1af5d2fSBarry Smith 
765f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
766f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
767f1af5d2fSBarry Smith     while (nz--) {
768f1af5d2fSBarry Smith       oidx = 7*(*vi++);
769f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776f1af5d2fSBarry Smith       v  += 49;
777f1af5d2fSBarry Smith     }
778f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
780f1af5d2fSBarry Smith     idx += 7;
781f1af5d2fSBarry Smith   }
782f1af5d2fSBarry Smith   /* backward solve the L^T */
783f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
784f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
785f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
786f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
787f1af5d2fSBarry Smith     idt  = 7*i;
788f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
790f1af5d2fSBarry Smith     while (nz--) {
791f1af5d2fSBarry Smith       idx   = 7*(*vi--);
792f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799f1af5d2fSBarry Smith       v -= 49;
800f1af5d2fSBarry Smith     }
801f1af5d2fSBarry Smith   }
8021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804f1af5d2fSBarry Smith   PetscFunctionReturn(0);
805f1af5d2fSBarry Smith }
8068499736aSShri Abhyankar #undef __FUNCT__
8074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
8084dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
8098499736aSShri Abhyankar {
8108499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
8118499736aSShri Abhyankar   PetscErrorCode    ierr;
812b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
8138499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
814b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
815b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
816b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
8178499736aSShri Abhyankar 
8188499736aSShri Abhyankar   PetscFunctionBegin;
8198499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8208499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8218499736aSShri Abhyankar 
8228499736aSShri Abhyankar   /* forward solve the U^T */
8238499736aSShri Abhyankar   idx = 0;
8248499736aSShri Abhyankar   for (i=0; i<n; i++) {
8258499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8268499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8278499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8288499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8298499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8308499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8318499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8328499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8338499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8348499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8358499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8368499736aSShri Abhyankar     v -= bs2;
8378499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8388499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8398499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
8408499736aSShri Abhyankar       oidx = bs*vi[j];
8418499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8428499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8438499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8448499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8458499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8468499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8478499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8488499736aSShri Abhyankar       v  -= bs2;
8498499736aSShri Abhyankar     }
8508499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8518499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8528499736aSShri Abhyankar     idx += bs;
8538499736aSShri Abhyankar   }
8548499736aSShri Abhyankar   /* backward solve the L^T */
8558499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
8568499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8578499736aSShri Abhyankar     vi   = aj + ai[i];
8588499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8598499736aSShri Abhyankar     idt  = bs*i;
8608499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8618499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
8628499736aSShri Abhyankar     for (j=0;j<nz;j++) {
8638499736aSShri Abhyankar       idx   = bs*vi[j];
8648499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8658499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8668499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8678499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8688499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8698499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8708499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8718499736aSShri Abhyankar       v += bs2;
8728499736aSShri Abhyankar     }
8738499736aSShri Abhyankar   }
8748499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
8758499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
8768499736aSShri Abhyankar   PetscFunctionReturn(0);
8778499736aSShri Abhyankar }
878f1af5d2fSBarry Smith 
879f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
8804a2ae208SSatish Balay #undef __FUNCT__
88193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
88293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
88393fd935bSShri Abhyankar {
88493fd935bSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
88593fd935bSShri Abhyankar   IS                iscol = a->col,isrow = a->row;
88693fd935bSShri Abhyankar   PetscErrorCode    ierr;
88793fd935bSShri Abhyankar   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
88893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
88993fd935bSShri Abhyankar   PetscInt          nz;
89093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
89193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
89293fd935bSShri Abhyankar   const PetscScalar *b;
89393fd935bSShri Abhyankar 
89493fd935bSShri Abhyankar   PetscFunctionBegin;
8953649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
89693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
89793fd935bSShri Abhyankar   tmp  = a->solve_work;
89893fd935bSShri Abhyankar 
89993fd935bSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
90093fd935bSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
90193fd935bSShri Abhyankar 
90293fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
90393fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[c[i]];
90493fd935bSShri Abhyankar 
90593fd935bSShri Abhyankar   /* forward solve the U^T */
90693fd935bSShri Abhyankar   for (i=0; i<n; i++) {
90793fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
90893fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
90993fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
91093fd935bSShri Abhyankar     s1  = tmp[i];
91193fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
91293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
91393fd935bSShri Abhyankar     tmp[i] = s1;
91493fd935bSShri Abhyankar   }
91593fd935bSShri Abhyankar 
91693fd935bSShri Abhyankar   /* backward solve the L^T */
91793fd935bSShri Abhyankar   for (i=n-1; i>=0; i--) {
91893fd935bSShri Abhyankar     v   = aa + ai[i];
91993fd935bSShri Abhyankar     vi  = aj + ai[i];
92093fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
92193fd935bSShri Abhyankar     s1  = tmp[i];
92293fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
92393fd935bSShri Abhyankar   }
92493fd935bSShri Abhyankar 
92593fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
92693fd935bSShri Abhyankar   for (i=0; i<n; i++) x[r[i]] = tmp[i];
92793fd935bSShri Abhyankar 
92893fd935bSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
92993fd935bSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9303649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
93193fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
93293fd935bSShri Abhyankar 
93393fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
93493fd935bSShri Abhyankar   PetscFunctionReturn(0);
93593fd935bSShri Abhyankar }
93693fd935bSShri Abhyankar 
93793fd935bSShri Abhyankar #undef __FUNCT__
93806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
93906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940f1af5d2fSBarry Smith {
941f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
942f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9436849ba73SBarry Smith   PetscErrorCode    ierr;
9445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
945b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946b3260449SShri Abhyankar   PetscInt          i,nz;
947b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
948b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
949b3260449SShri Abhyankar   const PetscScalar *b;
950f1af5d2fSBarry Smith 
951f1af5d2fSBarry Smith   PetscFunctionBegin;
9523649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
9531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954f1af5d2fSBarry Smith   t  = a->solve_work;
955f1af5d2fSBarry Smith 
956f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958f1af5d2fSBarry Smith 
959f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
960f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
961f1af5d2fSBarry Smith     t[i] = b[c[i]];
962f1af5d2fSBarry Smith   }
963f1af5d2fSBarry Smith 
964f1af5d2fSBarry Smith   /* forward solve the U^T */
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith 
967f1af5d2fSBarry Smith     v     = aa + diag[i];
968f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
969f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
970f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
971f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
974f1af5d2fSBarry Smith     }
975f1af5d2fSBarry Smith     t[i]   = s1;
976f1af5d2fSBarry Smith   }
977f1af5d2fSBarry Smith   /* backward solve the L^T */
978f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
979f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
980f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
981f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
982f1af5d2fSBarry Smith     s1   = t[i];
983f1af5d2fSBarry Smith     while (nz--) {
984f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
985f1af5d2fSBarry Smith     }
986f1af5d2fSBarry Smith   }
987f1af5d2fSBarry Smith 
988f1af5d2fSBarry Smith   /* copy t into x according to permutation */
989f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
990f1af5d2fSBarry Smith     x[r[i]]   = t[i];
991f1af5d2fSBarry Smith   }
992f1af5d2fSBarry Smith 
993f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
994f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9953649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
9961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
997dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   PetscFunctionReturn(0);
999f1af5d2fSBarry Smith }
1000f1af5d2fSBarry Smith 
10014a2ae208SSatish Balay #undef __FUNCT__
100206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
100306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004f1af5d2fSBarry Smith {
1005f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1006f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
10076849ba73SBarry Smith   PetscErrorCode    ierr;
10085d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1009b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1011b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1012b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1013b3260449SShri Abhyankar   const PetscScalar *b;
1014f1af5d2fSBarry Smith 
1015f1af5d2fSBarry Smith   PetscFunctionBegin;
10163649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
10171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018f1af5d2fSBarry Smith   t  = a->solve_work;
1019f1af5d2fSBarry Smith 
1020f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1021f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1022f1af5d2fSBarry Smith 
1023f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1024f1af5d2fSBarry Smith   ii = 0;
1025f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1026f1af5d2fSBarry Smith     ic      = 2*c[i];
1027f1af5d2fSBarry Smith     t[ii]   = b[ic];
1028f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1029f1af5d2fSBarry Smith     ii += 2;
1030f1af5d2fSBarry Smith   }
1031f1af5d2fSBarry Smith 
1032f1af5d2fSBarry Smith   /* forward solve the U^T */
1033f1af5d2fSBarry Smith   idx = 0;
1034f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1035f1af5d2fSBarry Smith 
1036f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
1037f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1038f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
1039f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
1040f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
1041f1af5d2fSBarry Smith     v += 4;
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1044f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1045f1af5d2fSBarry Smith     while (nz--) {
1046f1af5d2fSBarry Smith       oidx = 2*(*vi++);
1047f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1048f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1049f1af5d2fSBarry Smith       v  += 4;
1050f1af5d2fSBarry Smith     }
1051f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1052f1af5d2fSBarry Smith     idx += 2;
1053f1af5d2fSBarry Smith   }
1054f1af5d2fSBarry Smith   /* backward solve the L^T */
1055f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1056f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
1057f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1058f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1059f1af5d2fSBarry Smith     idt  = 2*i;
1060f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1061f1af5d2fSBarry Smith     while (nz--) {
1062f1af5d2fSBarry Smith       idx   = 2*(*vi--);
1063f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1064f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1065f1af5d2fSBarry Smith       v -= 4;
1066f1af5d2fSBarry Smith     }
1067f1af5d2fSBarry Smith   }
1068f1af5d2fSBarry Smith 
1069f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1070f1af5d2fSBarry Smith   ii = 0;
1071f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1072f1af5d2fSBarry Smith     ir      = 2*r[i];
1073f1af5d2fSBarry Smith     x[ir]   = t[ii];
1074f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1075f1af5d2fSBarry Smith     ii += 2;
1076f1af5d2fSBarry Smith   }
1077f1af5d2fSBarry Smith 
1078f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1079f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10803649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
10811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1082dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1083f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1084f1af5d2fSBarry Smith }
1085f1af5d2fSBarry Smith 
10864a2ae208SSatish Balay #undef __FUNCT__
10874dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
10884dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
108932121132SShri Abhyankar {
109032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
109132121132SShri Abhyankar   PetscErrorCode    ierr;
109232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1093b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
109432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
109532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1096b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1097b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1098b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1099b3260449SShri Abhyankar   const PetscScalar *b;
110032121132SShri Abhyankar 
110132121132SShri Abhyankar   PetscFunctionBegin;
11023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
110332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
110432121132SShri Abhyankar   t = a->solve_work;
110532121132SShri Abhyankar 
110632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
110732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
110832121132SShri Abhyankar 
110932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
111032121132SShri Abhyankar   for (i=0;i<n;i++) {
111132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
111232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
111332121132SShri Abhyankar   }
111432121132SShri Abhyankar 
111532121132SShri Abhyankar   /* forward solve the U^T */
111632121132SShri Abhyankar   idx = 0;
111732121132SShri Abhyankar   for (i=0; i<n; i++) {
111832121132SShri Abhyankar     v     = aa + bs2*diag[i];
111932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
112032121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
112132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
112232121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
112332121132SShri Abhyankar     v -= bs2;
112432121132SShri Abhyankar 
112532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
112632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
112732121132SShri Abhyankar     for (j=0;j>-nz;j--) {
112832121132SShri Abhyankar       oidx = bs*vi[j];
112932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
113032121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
113132121132SShri Abhyankar       v  -= bs2;
113232121132SShri Abhyankar     }
113332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
113432121132SShri Abhyankar     idx += bs;
113532121132SShri Abhyankar   }
113632121132SShri Abhyankar   /* backward solve the L^T */
113732121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
113832121132SShri Abhyankar     v    = aa + bs2*ai[i];
113932121132SShri Abhyankar     vi   = aj + ai[i];
114032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
114132121132SShri Abhyankar     idt  = bs*i;
114232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
114332121132SShri Abhyankar     for (j=0;j<nz;j++) {
114432121132SShri Abhyankar       idx   = bs*vi[j];
114532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
114632121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
114732121132SShri Abhyankar       v += bs2;
114832121132SShri Abhyankar     }
114932121132SShri Abhyankar   }
115032121132SShri Abhyankar 
115132121132SShri Abhyankar   /* copy t into x according to permutation */
115232121132SShri Abhyankar   for (i=0;i<n;i++) {
115332121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
115432121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
115532121132SShri Abhyankar   }
115632121132SShri Abhyankar 
115732121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
115832121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11593649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
116032121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
116132121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
116232121132SShri Abhyankar   PetscFunctionReturn(0);
116332121132SShri Abhyankar }
116432121132SShri Abhyankar 
116532121132SShri Abhyankar #undef __FUNCT__
116606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
116706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168f1af5d2fSBarry Smith {
1169f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1170f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
11716849ba73SBarry Smith   PetscErrorCode    ierr;
11725d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1173b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1175b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1176b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1177b3260449SShri Abhyankar   const PetscScalar *b;
1178f1af5d2fSBarry Smith 
1179f1af5d2fSBarry Smith   PetscFunctionBegin;
11803649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
11811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182f1af5d2fSBarry Smith   t  = a->solve_work;
1183f1af5d2fSBarry Smith 
1184f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1186f1af5d2fSBarry Smith 
1187f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1188f1af5d2fSBarry Smith   ii = 0;
1189f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1190f1af5d2fSBarry Smith     ic      = 3*c[i];
1191f1af5d2fSBarry Smith     t[ii]   = b[ic];
1192f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1193f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1194f1af5d2fSBarry Smith     ii += 3;
1195f1af5d2fSBarry Smith   }
1196f1af5d2fSBarry Smith 
1197f1af5d2fSBarry Smith   /* forward solve the U^T */
1198f1af5d2fSBarry Smith   idx = 0;
1199f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1200f1af5d2fSBarry Smith 
1201f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1202f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1203f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1204f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1205f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1206f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1207f1af5d2fSBarry Smith     v += 9;
1208f1af5d2fSBarry Smith 
1209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1211f1af5d2fSBarry Smith     while (nz--) {
1212f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1213f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1214f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1215f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216f1af5d2fSBarry Smith       v  += 9;
1217f1af5d2fSBarry Smith     }
1218f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1219f1af5d2fSBarry Smith     idx += 3;
1220f1af5d2fSBarry Smith   }
1221f1af5d2fSBarry Smith   /* backward solve the L^T */
1222f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1223f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1224f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1225f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1226f1af5d2fSBarry Smith     idt  = 3*i;
1227f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1228f1af5d2fSBarry Smith     while (nz--) {
1229f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1230f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1231f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1232f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233f1af5d2fSBarry Smith       v -= 9;
1234f1af5d2fSBarry Smith     }
1235f1af5d2fSBarry Smith   }
1236f1af5d2fSBarry Smith 
1237f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1238f1af5d2fSBarry Smith   ii = 0;
1239f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1240f1af5d2fSBarry Smith     ir      = 3*r[i];
1241f1af5d2fSBarry Smith     x[ir]   = t[ii];
1242f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1243f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1244f1af5d2fSBarry Smith     ii += 3;
1245f1af5d2fSBarry Smith   }
1246f1af5d2fSBarry Smith 
1247f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1248f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12493649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
12501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1252f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1253f1af5d2fSBarry Smith }
1254f1af5d2fSBarry Smith 
12554a2ae208SSatish Balay #undef __FUNCT__
12564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
12574dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
125832121132SShri Abhyankar {
125932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
126032121132SShri Abhyankar   PetscErrorCode    ierr;
126132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1262b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
126332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
126432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1265b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1266b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1267b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1268b3260449SShri Abhyankar   const PetscScalar *b;
126932121132SShri Abhyankar 
127032121132SShri Abhyankar   PetscFunctionBegin;
12713649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
127232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
127332121132SShri Abhyankar   t = a->solve_work;
127432121132SShri Abhyankar 
127532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
127632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
127732121132SShri Abhyankar 
127832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
127932121132SShri Abhyankar   for (i=0;i<n;i++) {
128032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
128132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
128232121132SShri Abhyankar   }
128332121132SShri Abhyankar 
128432121132SShri Abhyankar   /* forward solve the U^T */
128532121132SShri Abhyankar   idx = 0;
128632121132SShri Abhyankar   for (i=0; i<n; i++) {
128732121132SShri Abhyankar     v     = aa + bs2*diag[i];
128832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
128932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
129032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
129132121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
129232121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
129332121132SShri Abhyankar     v -= bs2;
129432121132SShri Abhyankar 
129532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
129632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
129732121132SShri Abhyankar     for (j=0;j>-nz;j--) {
129832121132SShri Abhyankar       oidx = bs*vi[j];
129932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
130032121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
130132121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
130232121132SShri Abhyankar       v  -= bs2;
130332121132SShri Abhyankar     }
130432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
130532121132SShri Abhyankar     idx += bs;
130632121132SShri Abhyankar   }
130732121132SShri Abhyankar   /* backward solve the L^T */
130832121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
130932121132SShri Abhyankar     v    = aa + bs2*ai[i];
131032121132SShri Abhyankar     vi   = aj + ai[i];
131132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
131232121132SShri Abhyankar     idt  = bs*i;
131332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
131432121132SShri Abhyankar     for (j=0;j<nz;j++) {
131532121132SShri Abhyankar       idx   = bs*vi[j];
131632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
131732121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
131832121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
131932121132SShri Abhyankar       v += bs2;
132032121132SShri Abhyankar     }
132132121132SShri Abhyankar   }
132232121132SShri Abhyankar 
132332121132SShri Abhyankar   /* copy t into x according to permutation */
132432121132SShri Abhyankar   for (i=0;i<n;i++) {
132532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
132632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
132732121132SShri Abhyankar   }
132832121132SShri Abhyankar 
132932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
133032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13313649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
133232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
133332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
133432121132SShri Abhyankar   PetscFunctionReturn(0);
133532121132SShri Abhyankar }
133632121132SShri Abhyankar 
133732121132SShri Abhyankar #undef __FUNCT__
133806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
133906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340f1af5d2fSBarry Smith {
1341f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1342f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
13436849ba73SBarry Smith   PetscErrorCode    ierr;
13445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1345b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1347b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1348b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349b3260449SShri Abhyankar   const PetscScalar *b;
1350f1af5d2fSBarry Smith 
1351f1af5d2fSBarry Smith   PetscFunctionBegin;
13523649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
13531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354f1af5d2fSBarry Smith   t  = a->solve_work;
1355f1af5d2fSBarry Smith 
1356f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1358f1af5d2fSBarry Smith 
1359f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1360f1af5d2fSBarry Smith   ii = 0;
1361f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1362f1af5d2fSBarry Smith     ic      = 4*c[i];
1363f1af5d2fSBarry Smith     t[ii]   = b[ic];
1364f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1365f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1366f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1367f1af5d2fSBarry Smith     ii += 4;
1368f1af5d2fSBarry Smith   }
1369f1af5d2fSBarry Smith 
1370f1af5d2fSBarry Smith   /* forward solve the U^T */
1371f1af5d2fSBarry Smith   idx = 0;
1372f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1373f1af5d2fSBarry Smith 
1374f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1375f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1376f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1377f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1378f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1379f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1380f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381f1af5d2fSBarry Smith     v += 16;
1382f1af5d2fSBarry Smith 
1383f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1384f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1385f1af5d2fSBarry Smith     while (nz--) {
1386f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1387f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1388f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1389f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391f1af5d2fSBarry Smith       v  += 16;
1392f1af5d2fSBarry Smith     }
1393f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394f1af5d2fSBarry Smith     idx += 4;
1395f1af5d2fSBarry Smith   }
1396f1af5d2fSBarry Smith   /* backward solve the L^T */
1397f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1398f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1399f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1400f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1401f1af5d2fSBarry Smith     idt  = 4*i;
1402f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403f1af5d2fSBarry Smith     while (nz--) {
1404f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1405f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1406f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1407f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409f1af5d2fSBarry Smith       v -= 16;
1410f1af5d2fSBarry Smith     }
1411f1af5d2fSBarry Smith   }
1412f1af5d2fSBarry Smith 
1413f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1414f1af5d2fSBarry Smith   ii = 0;
1415f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1416f1af5d2fSBarry Smith     ir      = 4*r[i];
1417f1af5d2fSBarry Smith     x[ir]   = t[ii];
1418f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1419f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1420f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1421f1af5d2fSBarry Smith     ii += 4;
1422f1af5d2fSBarry Smith   }
1423f1af5d2fSBarry Smith 
1424f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1425f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14263649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
14271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1428dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1429f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1430f1af5d2fSBarry Smith }
1431f1af5d2fSBarry Smith 
14324a2ae208SSatish Balay #undef __FUNCT__
14334dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
14344dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
143532121132SShri Abhyankar {
143632121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
143732121132SShri Abhyankar   PetscErrorCode    ierr;
143832121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1439b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
144032121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
144132121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1442b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1443b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1444b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445b3260449SShri Abhyankar   const PetscScalar *b;
144632121132SShri Abhyankar 
144732121132SShri Abhyankar   PetscFunctionBegin;
14483649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
144932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
145032121132SShri Abhyankar   t = a->solve_work;
145132121132SShri Abhyankar 
145232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
145332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
145432121132SShri Abhyankar 
145532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
145632121132SShri Abhyankar   for (i=0;i<n;i++) {
145732121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
145832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
145932121132SShri Abhyankar   }
146032121132SShri Abhyankar 
146132121132SShri Abhyankar   /* forward solve the U^T */
146232121132SShri Abhyankar   idx = 0;
146332121132SShri Abhyankar   for (i=0; i<n; i++) {
146432121132SShri Abhyankar     v     = aa + bs2*diag[i];
146532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
146632121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
146732121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
146832121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
146932121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
147032121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
147132121132SShri Abhyankar     v -= bs2;
147232121132SShri Abhyankar 
147332121132SShri Abhyankar     vi    = aj + diag[i] - 1;
147432121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
147532121132SShri Abhyankar     for (j=0;j>-nz;j--) {
147632121132SShri Abhyankar       oidx = bs*vi[j];
147732121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
147832121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
147932121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
148032121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
148132121132SShri Abhyankar       v  -= bs2;
148232121132SShri Abhyankar     }
148332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
148432121132SShri Abhyankar     idx += bs;
148532121132SShri Abhyankar   }
148632121132SShri Abhyankar   /* backward solve the L^T */
148732121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
148832121132SShri Abhyankar     v    = aa + bs2*ai[i];
148932121132SShri Abhyankar     vi   = aj + ai[i];
149032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
149132121132SShri Abhyankar     idt  = bs*i;
149232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
149332121132SShri Abhyankar     for (j=0;j<nz;j++) {
149432121132SShri Abhyankar       idx   = bs*vi[j];
149532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
149632121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
149732121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
149832121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
149932121132SShri Abhyankar       v += bs2;
150032121132SShri Abhyankar     }
150132121132SShri Abhyankar   }
150232121132SShri Abhyankar 
150332121132SShri Abhyankar   /* copy t into x according to permutation */
150432121132SShri Abhyankar   for (i=0;i<n;i++) {
150532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
150632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
150732121132SShri Abhyankar   }
150832121132SShri Abhyankar 
150932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
151032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
151232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
151332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
151432121132SShri Abhyankar   PetscFunctionReturn(0);
151532121132SShri Abhyankar }
151632121132SShri Abhyankar 
151732121132SShri Abhyankar #undef __FUNCT__
151806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
151906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520f1af5d2fSBarry Smith {
1521f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1522f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
15236849ba73SBarry Smith   PetscErrorCode    ierr;
15245d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1525b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1527b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1528b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529b3260449SShri Abhyankar   const PetscScalar *b;
1530f1af5d2fSBarry Smith 
1531f1af5d2fSBarry Smith   PetscFunctionBegin;
15323649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
15331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534f1af5d2fSBarry Smith   t  = a->solve_work;
1535f1af5d2fSBarry Smith 
1536f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1538f1af5d2fSBarry Smith 
1539f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1540f1af5d2fSBarry Smith   ii = 0;
1541f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1542f1af5d2fSBarry Smith     ic      = 5*c[i];
1543f1af5d2fSBarry Smith     t[ii]   = b[ic];
1544f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1545f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1546f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1547f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1548f1af5d2fSBarry Smith     ii += 5;
1549f1af5d2fSBarry Smith   }
1550f1af5d2fSBarry Smith 
1551f1af5d2fSBarry Smith   /* forward solve the U^T */
1552f1af5d2fSBarry Smith   idx = 0;
1553f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1554f1af5d2fSBarry Smith 
1555f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1556f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1557f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1559f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1560f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563f1af5d2fSBarry Smith     v += 25;
1564f1af5d2fSBarry Smith 
1565f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1566f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1567f1af5d2fSBarry Smith     while (nz--) {
1568f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1569f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1570f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1571f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574f1af5d2fSBarry Smith       v  += 25;
1575f1af5d2fSBarry Smith     }
1576f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577f1af5d2fSBarry Smith     idx += 5;
1578f1af5d2fSBarry Smith   }
1579f1af5d2fSBarry Smith   /* backward solve the L^T */
1580f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1581f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1582f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1583f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1584f1af5d2fSBarry Smith     idt  = 5*i;
1585f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586f1af5d2fSBarry Smith     while (nz--) {
1587f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1588f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1589f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1590f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593f1af5d2fSBarry Smith       v -= 25;
1594f1af5d2fSBarry Smith     }
1595f1af5d2fSBarry Smith   }
1596f1af5d2fSBarry Smith 
1597f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1598f1af5d2fSBarry Smith   ii = 0;
1599f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1600f1af5d2fSBarry Smith     ir      = 5*r[i];
1601f1af5d2fSBarry Smith     x[ir]   = t[ii];
1602f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1603f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1604f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1605f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1606f1af5d2fSBarry Smith     ii += 5;
1607f1af5d2fSBarry Smith   }
1608f1af5d2fSBarry Smith 
1609f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1610f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1613dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1614f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1615f1af5d2fSBarry Smith }
1616f1af5d2fSBarry Smith 
16174a2ae208SSatish Balay #undef __FUNCT__
16184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
16194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
162032121132SShri Abhyankar {
162132121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
162232121132SShri Abhyankar   PetscErrorCode    ierr;
162332121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1624b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
162532121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
162632121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1627b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1628b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1629b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630b3260449SShri Abhyankar   const PetscScalar *b;
163132121132SShri Abhyankar 
163232121132SShri Abhyankar   PetscFunctionBegin;
16333649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
163432121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
163532121132SShri Abhyankar   t = a->solve_work;
163632121132SShri Abhyankar 
163732121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
163832121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
163932121132SShri Abhyankar 
164032121132SShri Abhyankar   /* copy b into temp work space according to permutation */
164132121132SShri Abhyankar   for (i=0;i<n;i++) {
164232121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
164332121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
164432121132SShri Abhyankar     t[ii+4] = b[ic+4];
164532121132SShri Abhyankar   }
164632121132SShri Abhyankar 
164732121132SShri Abhyankar   /* forward solve the U^T */
164832121132SShri Abhyankar   idx = 0;
164932121132SShri Abhyankar   for (i=0; i<n; i++) {
165032121132SShri Abhyankar     v     = aa + bs2*diag[i];
165132121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
165232121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
165332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
165432121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
165532121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
165632121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
165732121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
165832121132SShri Abhyankar     v -= bs2;
165932121132SShri Abhyankar 
166032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
166132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
166232121132SShri Abhyankar     for (j=0;j>-nz;j--) {
166332121132SShri Abhyankar       oidx = bs*vi[j];
166432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
166532121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
166632121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
166732121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
166832121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
166932121132SShri Abhyankar       v  -= bs2;
167032121132SShri Abhyankar     }
167132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
167232121132SShri Abhyankar     idx += bs;
167332121132SShri Abhyankar   }
167432121132SShri Abhyankar   /* backward solve the L^T */
167532121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
167632121132SShri Abhyankar     v    = aa + bs2*ai[i];
167732121132SShri Abhyankar     vi   = aj + ai[i];
167832121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
167932121132SShri Abhyankar     idt  = bs*i;
168032121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
168132121132SShri Abhyankar     for (j=0;j<nz;j++) {
168232121132SShri Abhyankar       idx   = bs*vi[j];
168332121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
168432121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
168532121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
168632121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
168732121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
168832121132SShri Abhyankar       v += bs2;
168932121132SShri Abhyankar     }
169032121132SShri Abhyankar   }
169132121132SShri Abhyankar 
169232121132SShri Abhyankar   /* copy t into x according to permutation */
169332121132SShri Abhyankar   for (i=0;i<n;i++) {
169432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
169532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
169632121132SShri Abhyankar     x[ir+4] = t[ii+4];
169732121132SShri Abhyankar   }
169832121132SShri Abhyankar 
169932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
170032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17013649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
170232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
170432121132SShri Abhyankar   PetscFunctionReturn(0);
170532121132SShri Abhyankar }
170632121132SShri Abhyankar 
170732121132SShri Abhyankar #undef __FUNCT__
170806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
170906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710f1af5d2fSBarry Smith {
1711f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1712f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
17136849ba73SBarry Smith   PetscErrorCode    ierr;
17145d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1715b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1717b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1718b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719b3260449SShri Abhyankar   const PetscScalar *b;
1720f1af5d2fSBarry Smith 
1721f1af5d2fSBarry Smith   PetscFunctionBegin;
17223649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
17231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1724f1af5d2fSBarry Smith   t  = a->solve_work;
1725f1af5d2fSBarry Smith 
1726f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1727f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1728f1af5d2fSBarry Smith 
1729f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1730f1af5d2fSBarry Smith   ii = 0;
1731f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1732f1af5d2fSBarry Smith     ic      = 6*c[i];
1733f1af5d2fSBarry Smith     t[ii]   = b[ic];
1734f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1735f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1736f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1737f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1738f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1739f1af5d2fSBarry Smith     ii += 6;
1740f1af5d2fSBarry Smith   }
1741f1af5d2fSBarry Smith 
1742f1af5d2fSBarry Smith   /* forward solve the U^T */
1743f1af5d2fSBarry Smith   idx = 0;
1744f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1745f1af5d2fSBarry Smith 
1746f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1747f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1748f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749f1af5d2fSBarry Smith     x6    = t[5+idx];
1750f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1751f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1752f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756f1af5d2fSBarry Smith     v += 36;
1757f1af5d2fSBarry Smith 
1758f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1759f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1760f1af5d2fSBarry Smith     while (nz--) {
1761f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1762f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1763f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1764f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768f1af5d2fSBarry Smith       v  += 36;
1769f1af5d2fSBarry Smith     }
1770f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771f1af5d2fSBarry Smith     t[5+idx] = s6;
1772f1af5d2fSBarry Smith     idx += 6;
1773f1af5d2fSBarry Smith   }
1774f1af5d2fSBarry Smith   /* backward solve the L^T */
1775f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1776f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1777f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1778f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1779f1af5d2fSBarry Smith     idt  = 6*i;
1780f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781f1af5d2fSBarry Smith     s6 = t[5+idt];
1782f1af5d2fSBarry Smith     while (nz--) {
1783f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1784f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1785f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1786f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790f1af5d2fSBarry Smith       v -= 36;
1791f1af5d2fSBarry Smith     }
1792f1af5d2fSBarry Smith   }
1793f1af5d2fSBarry Smith 
1794f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1795f1af5d2fSBarry Smith   ii = 0;
1796f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1797f1af5d2fSBarry Smith     ir      = 6*r[i];
1798f1af5d2fSBarry Smith     x[ir]   = t[ii];
1799f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1800f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1801f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1802f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1803f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1804f1af5d2fSBarry Smith     ii += 6;
1805f1af5d2fSBarry Smith   }
1806f1af5d2fSBarry Smith 
1807f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1808f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18093649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
18101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1811dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1812f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1813f1af5d2fSBarry Smith }
1814f1af5d2fSBarry Smith 
18154a2ae208SSatish Balay #undef __FUNCT__
18164dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
18174dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
181832121132SShri Abhyankar {
181932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
182032121132SShri Abhyankar   PetscErrorCode    ierr;
182132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1822b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
182332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
182432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1825b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1826b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1827b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828b3260449SShri Abhyankar   const PetscScalar *b;
182932121132SShri Abhyankar 
183032121132SShri Abhyankar   PetscFunctionBegin;
18313649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
183232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
183332121132SShri Abhyankar   t = a->solve_work;
183432121132SShri Abhyankar 
183532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
183632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
183732121132SShri Abhyankar 
183832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
183932121132SShri Abhyankar   for (i=0;i<n;i++) {
184032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
184132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
184232121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
184332121132SShri Abhyankar   }
184432121132SShri Abhyankar 
184532121132SShri Abhyankar   /* forward solve the U^T */
184632121132SShri Abhyankar   idx = 0;
184732121132SShri Abhyankar   for (i=0; i<n; i++) {
184832121132SShri Abhyankar     v     = aa + bs2*diag[i];
184932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
185032121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
185132121132SShri Abhyankar     x6    = t[5+idx];
185232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
185332121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
185432121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
185532121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
185632121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
185732121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
185832121132SShri Abhyankar     v -= bs2;
185932121132SShri Abhyankar 
186032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
186132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
186232121132SShri Abhyankar     for (j=0;j>-nz;j--) {
186332121132SShri Abhyankar       oidx = bs*vi[j];
186432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
186532121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
186632121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
186732121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
186832121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
186932121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
187032121132SShri Abhyankar       v  -= bs2;
187132121132SShri Abhyankar     }
187232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
187332121132SShri Abhyankar     t[5+idx] = s6;
187432121132SShri Abhyankar     idx += bs;
187532121132SShri Abhyankar   }
187632121132SShri Abhyankar   /* backward solve the L^T */
187732121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
187832121132SShri Abhyankar     v    = aa + bs2*ai[i];
187932121132SShri Abhyankar     vi   = aj + ai[i];
188032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
188132121132SShri Abhyankar     idt  = bs*i;
188232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
188332121132SShri Abhyankar     s6   = t[5+idt];
188432121132SShri Abhyankar    for (j=0;j<nz;j++) {
188532121132SShri Abhyankar       idx   = bs*vi[j];
188632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
188732121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
188832121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
188932121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
189032121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
189132121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
189232121132SShri Abhyankar       v += bs2;
189332121132SShri Abhyankar     }
189432121132SShri Abhyankar   }
189532121132SShri Abhyankar 
189632121132SShri Abhyankar   /* copy t into x according to permutation */
189732121132SShri Abhyankar   for (i=0;i<n;i++) {
189832121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
189932121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
190032121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
190132121132SShri Abhyankar   }
190232121132SShri Abhyankar 
190332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
190432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19053649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
190632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
190732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
190832121132SShri Abhyankar   PetscFunctionReturn(0);
190932121132SShri Abhyankar }
191032121132SShri Abhyankar 
191132121132SShri Abhyankar #undef __FUNCT__
191206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
191306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914f1af5d2fSBarry Smith {
1915f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1916f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
19176849ba73SBarry Smith   PetscErrorCode    ierr;
19185d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1919b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1921b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1922b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923b3260449SShri Abhyankar   const PetscScalar *b;
1924f1af5d2fSBarry Smith 
1925f1af5d2fSBarry Smith   PetscFunctionBegin;
19263649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1928f1af5d2fSBarry Smith   t  = a->solve_work;
1929f1af5d2fSBarry Smith 
1930f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1931f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1932f1af5d2fSBarry Smith 
1933f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1934f1af5d2fSBarry Smith   ii = 0;
1935f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1936f1af5d2fSBarry Smith     ic      = 7*c[i];
1937f1af5d2fSBarry Smith     t[ii]   = b[ic];
1938f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1939f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1940f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1941f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1942f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1943f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1944f1af5d2fSBarry Smith     ii += 7;
1945f1af5d2fSBarry Smith   }
1946f1af5d2fSBarry Smith 
1947f1af5d2fSBarry Smith   /* forward solve the U^T */
1948f1af5d2fSBarry Smith   idx = 0;
1949f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1950f1af5d2fSBarry Smith 
1951f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1952f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1953f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1955f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1956f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962f1af5d2fSBarry Smith     v += 49;
1963f1af5d2fSBarry Smith 
1964f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1965f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1966f1af5d2fSBarry Smith     while (nz--) {
1967f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1968f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1969f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975f1af5d2fSBarry Smith       v  += 49;
1976f1af5d2fSBarry Smith     }
1977f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1979f1af5d2fSBarry Smith     idx += 7;
1980f1af5d2fSBarry Smith   }
1981f1af5d2fSBarry Smith   /* backward solve the L^T */
1982f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--) {
1983f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1984f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1985f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1986f1af5d2fSBarry Smith     idt  = 7*i;
1987f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1989f1af5d2fSBarry Smith     while (nz--) {
1990f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1991f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1992f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998f1af5d2fSBarry Smith       v -= 49;
1999f1af5d2fSBarry Smith     }
2000f1af5d2fSBarry Smith   }
2001f1af5d2fSBarry Smith 
2002f1af5d2fSBarry Smith   /* copy t into x according to permutation */
2003f1af5d2fSBarry Smith   ii = 0;
2004f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
2005f1af5d2fSBarry Smith     ir      = 7*r[i];
2006f1af5d2fSBarry Smith     x[ir]   = t[ii];
2007f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
2008f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
2009f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
2010f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
2011f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
2012f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
2013f1af5d2fSBarry Smith     ii += 7;
2014f1af5d2fSBarry Smith   }
2015f1af5d2fSBarry Smith 
2016f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2017f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
20191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2020dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2021f1af5d2fSBarry Smith   PetscFunctionReturn(0);
2022f1af5d2fSBarry Smith }
202332121132SShri Abhyankar #undef __FUNCT__
20244dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
20254dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
202632121132SShri Abhyankar {
202732121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
202832121132SShri Abhyankar   PetscErrorCode    ierr;
202932121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2030b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
203132121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
203232121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2033b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2034b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2035b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036b3260449SShri Abhyankar   const PetscScalar *b;
203732121132SShri Abhyankar 
203832121132SShri Abhyankar   PetscFunctionBegin;
20393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
204032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
204132121132SShri Abhyankar   t = a->solve_work;
204232121132SShri Abhyankar 
204332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
204432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
204532121132SShri Abhyankar 
204632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
204732121132SShri Abhyankar   for (i=0;i<n;i++) {
204832121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
204932121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
205032121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
205132121132SShri Abhyankar   }
205232121132SShri Abhyankar 
205332121132SShri Abhyankar   /* forward solve the U^T */
205432121132SShri Abhyankar   idx = 0;
205532121132SShri Abhyankar   for (i=0; i<n; i++) {
205632121132SShri Abhyankar     v     = aa + bs2*diag[i];
205732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
205832121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
205932121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
206032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
206132121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
206232121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
206332121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
206432121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
206532121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
206632121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
206732121132SShri Abhyankar     v -= bs2;
206832121132SShri Abhyankar 
206932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
207032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
207132121132SShri Abhyankar     for (j=0;j>-nz;j--) {
207232121132SShri Abhyankar       oidx = bs*vi[j];
207332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
207432121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
207532121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
207632121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
207732121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
207832121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
207932121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
208032121132SShri Abhyankar       v  -= bs2;
208132121132SShri Abhyankar     }
208232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
208332121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
208432121132SShri Abhyankar     idx += bs;
208532121132SShri Abhyankar   }
208632121132SShri Abhyankar   /* backward solve the L^T */
208732121132SShri Abhyankar   for (i=n-1; i>=0; i--) {
208832121132SShri Abhyankar     v    = aa + bs2*ai[i];
208932121132SShri Abhyankar     vi   = aj + ai[i];
209032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
209132121132SShri Abhyankar     idt  = bs*i;
209232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
209332121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
209432121132SShri Abhyankar    for (j=0;j<nz;j++) {
209532121132SShri Abhyankar       idx   = bs*vi[j];
209632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
209732121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
209832121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
209932121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
210032121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
210132121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
210232121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
210332121132SShri Abhyankar       v += bs2;
210432121132SShri Abhyankar     }
210532121132SShri Abhyankar   }
210632121132SShri Abhyankar 
210732121132SShri Abhyankar   /* copy t into x according to permutation */
210832121132SShri Abhyankar   for (i=0;i<n;i++) {
210932121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
211032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
211132121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
211232121132SShri Abhyankar   }
211332121132SShri Abhyankar 
211432121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
211532121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
211732121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
211832121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
211932121132SShri Abhyankar   PetscFunctionReturn(0);
212032121132SShri Abhyankar }
2121f1af5d2fSBarry Smith 
21224e2b4712SSatish Balay /* ----------------------------------------------------------- */
21234a2ae208SSatish Balay #undef __FUNCT__
212406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
212506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21264e2b4712SSatish Balay {
21274e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21284e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21296849ba73SBarry Smith   PetscErrorCode    ierr;
2130b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2131b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132b3260449SShri Abhyankar   PetscInt          i,nz;
2133b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2134b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2135b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2136b3260449SShri Abhyankar   const PetscScalar *b;
21374e2b4712SSatish Balay 
21384e2b4712SSatish Balay   PetscFunctionBegin;
21393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2141f1af5d2fSBarry Smith   t  = a->solve_work;
21424e2b4712SSatish Balay 
21434e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21444e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21454e2b4712SSatish Balay 
21464e2b4712SSatish Balay   /* forward solve the lower triangular */
214787828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21484e2b4712SSatish Balay   for (i=1; i<n; i++) {
21494e2b4712SSatish Balay     v   = aa + bs2*ai[i];
21504e2b4712SSatish Balay     vi  = aj + ai[i];
21514e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2152f1af5d2fSBarry Smith     s = t + bs*i;
215387828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21544e2b4712SSatish Balay     while (nz--) {
215596b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
21564e2b4712SSatish Balay       v += bs2;
21574e2b4712SSatish Balay     }
21584e2b4712SSatish Balay   }
21594e2b4712SSatish Balay   /* backward solve the upper triangular */
2160d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
21614e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
21624e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
21634e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
21644e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
216587828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21664e2b4712SSatish Balay     while (nz--) {
216796b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
21684e2b4712SSatish Balay       v += bs2;
21694e2b4712SSatish Balay     }
217096b95a6bSBarry Smith     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
217187828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21724e2b4712SSatish Balay   }
21734e2b4712SSatish Balay 
21744e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21754e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21763649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
21771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2178dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21794e2b4712SSatish Balay   PetscFunctionReturn(0);
21804e2b4712SSatish Balay }
21814e2b4712SSatish Balay 
21825c42ef9dSBarry Smith /* ----------------------------------------------------------- */
21835c42ef9dSBarry Smith #undef __FUNCT__
218406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
218506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21865c42ef9dSBarry Smith {
21875c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21885c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
21895c42ef9dSBarry Smith   PetscErrorCode    ierr;
21905c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191b3260449SShri Abhyankar   PetscInt          i,nz,j;
2192b3260449SShri Abhyankar   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
21935c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
21945c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
21955c42ef9dSBarry Smith   const PetscScalar *b;
2196*6e111a19SKarl Rupp 
21975c42ef9dSBarry Smith   PetscFunctionBegin;
21983649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21995c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22005c42ef9dSBarry Smith   t    = a->solve_work;
22015c42ef9dSBarry Smith 
22025c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22035c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22045c42ef9dSBarry Smith 
22055c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
22065c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22075c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22085c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
22095c42ef9dSBarry Smith     }
22105c42ef9dSBarry Smith   }
22115c42ef9dSBarry Smith 
22125c42ef9dSBarry Smith 
22135c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
22145c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
22155c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22165c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
221796b95a6bSBarry Smith     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
22185c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
22195c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
22205c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
22215c42ef9dSBarry Smith     while (nz--) {
222296b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22235c42ef9dSBarry Smith       v += bs2;
22245c42ef9dSBarry Smith     }
22255c42ef9dSBarry Smith   }
22265c42ef9dSBarry Smith 
22275c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
22285c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
22295c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
22305c42ef9dSBarry Smith     vi  = aj + ai[i];
22315c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
22325c42ef9dSBarry Smith     while (nz--) {
223396b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22345c42ef9dSBarry Smith       v += bs2;
22355c42ef9dSBarry Smith     }
22365c42ef9dSBarry Smith   }
22375c42ef9dSBarry Smith 
22385c42ef9dSBarry Smith   /* copy t into x according to permutation */
22395c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22405c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22415c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
22425c42ef9dSBarry Smith     }
22435c42ef9dSBarry Smith   }
22445c42ef9dSBarry Smith 
22455c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22465c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22473649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22485c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22495c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22505c42ef9dSBarry Smith   PetscFunctionReturn(0);
22515c42ef9dSBarry Smith }
22525c42ef9dSBarry Smith 
22534a2ae208SSatish Balay #undef __FUNCT__
22544dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
22554dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
22568499736aSShri Abhyankar {
22578499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22588499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22598499736aSShri Abhyankar   PetscErrorCode    ierr;
2260b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2261b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2262b3260449SShri Abhyankar   PetscInt          i,j,nz;
2263b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
22648499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
22658499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
22668499736aSShri Abhyankar   const PetscScalar *b;
2267b3260449SShri Abhyankar 
22688499736aSShri Abhyankar   PetscFunctionBegin;
22693649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
22708499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22718499736aSShri Abhyankar   t    = a->solve_work;
22728499736aSShri Abhyankar 
22738499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22748499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22758499736aSShri Abhyankar 
22768499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
22778499736aSShri Abhyankar   for (i=0; i<n; i++) {
22788499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22798499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
22808499736aSShri Abhyankar     }
22818499736aSShri Abhyankar   }
22828499736aSShri Abhyankar 
22838499736aSShri Abhyankar 
22848499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
22858499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
22868499736aSShri Abhyankar   for (i=0; i<n; i++) {
22878499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
228896b95a6bSBarry Smith     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
22898499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
22908499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
22918499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
22928499736aSShri Abhyankar     for (j=0;j>-nz;j--) {
229396b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22948499736aSShri Abhyankar       v -= bs2;
22958499736aSShri Abhyankar     }
22968499736aSShri Abhyankar   }
22978499736aSShri Abhyankar 
22988499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
22998499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
23008499736aSShri Abhyankar     v   = aa + bs2*ai[i];
23018499736aSShri Abhyankar     vi  = aj + ai[i];
23028499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
23038499736aSShri Abhyankar     for (j=0;j<nz;j++) {
230496b95a6bSBarry Smith       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23058499736aSShri Abhyankar       v += bs2;
23068499736aSShri Abhyankar     }
23078499736aSShri Abhyankar   }
23088499736aSShri Abhyankar 
23098499736aSShri Abhyankar   /* copy t into x according to permutation */
23108499736aSShri Abhyankar   for (i=0; i<n; i++) {
23118499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23128499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
23138499736aSShri Abhyankar     }
23148499736aSShri Abhyankar   }
23158499736aSShri Abhyankar 
23168499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23178499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
23198499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23208499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
23218499736aSShri Abhyankar   PetscFunctionReturn(0);
23228499736aSShri Abhyankar }
23238499736aSShri Abhyankar 
2324832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
232529a97285SShri Abhyankar 
23262b0b2ea7SShri Abhyankar #undef __FUNCT__
2327832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2328832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
23292b0b2ea7SShri Abhyankar {
23302b0b2ea7SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
23312b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2332b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23330fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
23340b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
23352b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
23362b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
23370fa040f9SShri Abhyankar   PetscScalar       *x;
23380b68f018SBarry Smith   const PetscScalar *b;
23392b0b2ea7SShri Abhyankar 
23402b0b2ea7SShri Abhyankar   PetscFunctionBegin;
23413649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23422b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23432b0b2ea7SShri Abhyankar 
23442b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
234529a97285SShri Abhyankar   idx    = 0;
23460fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
23470fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
23480fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
23492b0b2ea7SShri Abhyankar 
23502b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
23512b0b2ea7SShri Abhyankar     v     = aa + bs2*ai[i];
23522b0b2ea7SShri Abhyankar     vi    = aj + ai[i];
23532b0b2ea7SShri Abhyankar     nz    = ai[i+1] - ai[i];
23540fa040f9SShri Abhyankar     idt   = bs*i;
23550fa040f9SShri Abhyankar     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
23560fa040f9SShri Abhyankar     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
23570fa040f9SShri Abhyankar     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
23582b0b2ea7SShri Abhyankar     for (m=0;m<nz;m++) {
23592b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23600fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23610fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
23620fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
23632b0b2ea7SShri Abhyankar 
23640b8f6341SShri Abhyankar 
23652b0b2ea7SShri Abhyankar       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
23662b0b2ea7SShri Abhyankar       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
23672b0b2ea7SShri Abhyankar       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
23682b0b2ea7SShri Abhyankar       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
23692b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
23702b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
23712b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
23722b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
23732b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
23742b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
23752b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
23762b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
23772b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
23782b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
23792b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
23802b0b2ea7SShri Abhyankar 
23812b0b2ea7SShri Abhyankar       v += bs2;
23822b0b2ea7SShri Abhyankar     }
23830fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
23840fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
23850fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
23862b0b2ea7SShri Abhyankar 
23872b0b2ea7SShri Abhyankar   }
23882b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
23892b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--) {
23902b0b2ea7SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
23912b0b2ea7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
23922b0b2ea7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
23932b0b2ea7SShri Abhyankar     idt  = bs*i;
23940fa040f9SShri Abhyankar     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
23950fa040f9SShri Abhyankar     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
23960fa040f9SShri Abhyankar     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
23972b0b2ea7SShri Abhyankar 
23982b0b2ea7SShri Abhyankar     for (m=0;m<nz;m++) {
23992b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
24000fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
24010fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24020fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24032b0b2ea7SShri Abhyankar 
24042b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24052b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24062b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24072b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24082b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24092b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24102b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24112b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24122b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24132b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24142b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24152b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24162b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24172b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24182b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24192b0b2ea7SShri Abhyankar 
24202b0b2ea7SShri Abhyankar       v += bs2;
24212b0b2ea7SShri Abhyankar     }
24222b0b2ea7SShri Abhyankar 
24230fa040f9SShri Abhyankar     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
24240fa040f9SShri Abhyankar     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
24250fa040f9SShri Abhyankar     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
24260fa040f9SShri Abhyankar     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
24270fa040f9SShri Abhyankar     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
24280fa040f9SShri Abhyankar     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
24290fa040f9SShri Abhyankar     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
24300fa040f9SShri Abhyankar     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
24310fa040f9SShri Abhyankar     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
24320fa040f9SShri Abhyankar     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
24330fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
24340fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
24350fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
24360fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
24370fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
24382b0b2ea7SShri Abhyankar 
24392b0b2ea7SShri Abhyankar   }
24402b0b2ea7SShri Abhyankar 
24413649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
24422b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24432b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24442b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
24452b0b2ea7SShri Abhyankar }
24462b0b2ea7SShri Abhyankar 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2448832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2449832cc040SShri Abhyankar 
24508499736aSShri Abhyankar #undef __FUNCT__
2451832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2452832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
24530b8f6341SShri Abhyankar {
24540b8f6341SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
24550b8f6341SShri Abhyankar   PetscErrorCode    ierr;
24560b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
245753ef36baSBarry Smith   PetscInt          i,k,nz,idx,idt,m;
24580b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24590b8f6341SShri Abhyankar   PetscScalar       s[15];
246053ef36baSBarry Smith   PetscScalar       *x,xv;
24610b8f6341SShri Abhyankar   const PetscScalar *b;
24620b8f6341SShri Abhyankar 
24630b8f6341SShri Abhyankar   PetscFunctionBegin;
24643649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
24650b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24660b8f6341SShri Abhyankar 
24670b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2468832cc040SShri Abhyankar   for (i=0; i<n; i++) {
24690b8f6341SShri Abhyankar     v     = aa + bs2*ai[i];
24700b8f6341SShri Abhyankar     vi    = aj + ai[i];
24710b8f6341SShri Abhyankar     nz    = ai[i+1] - ai[i];
24720fa040f9SShri Abhyankar     idt   = bs*i;
2473832cc040SShri Abhyankar     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2474832cc040SShri Abhyankar     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2475832cc040SShri Abhyankar     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
24760b8f6341SShri Abhyankar     for (m=0;m<nz;m++) {
24770b8f6341SShri Abhyankar       idx   = bs*vi[m];
24780b8f6341SShri Abhyankar       for (k=0;k<15;k++) {
247953ef36baSBarry Smith         xv        = x[k + idx];
248053ef36baSBarry Smith         x[idt]    -= v[0]*xv;
248153ef36baSBarry Smith         x[1+idt]  -= v[1]*xv;
248253ef36baSBarry Smith         x[2+idt]  -= v[2]*xv;
248353ef36baSBarry Smith         x[3+idt]  -= v[3]*xv;
248453ef36baSBarry Smith         x[4+idt]  -= v[4]*xv;
248553ef36baSBarry Smith         x[5+idt]  -= v[5]*xv;
248653ef36baSBarry Smith         x[6+idt]  -= v[6]*xv;
248753ef36baSBarry Smith         x[7+idt]  -= v[7]*xv;
248853ef36baSBarry Smith         x[8+idt]  -= v[8]*xv;
248953ef36baSBarry Smith         x[9+idt]  -= v[9]*xv;
249053ef36baSBarry Smith         x[10+idt] -= v[10]*xv;
249153ef36baSBarry Smith         x[11+idt] -= v[11]*xv;
249253ef36baSBarry Smith         x[12+idt] -= v[12]*xv;
249353ef36baSBarry Smith         x[13+idt] -= v[13]*xv;
249453ef36baSBarry Smith         x[14+idt] -= v[14]*xv;
24950b8f6341SShri Abhyankar         v += 15;
24960b8f6341SShri Abhyankar       }
24970b8f6341SShri Abhyankar     }
24980b8f6341SShri Abhyankar   }
24990b8f6341SShri Abhyankar   /* backward solve the upper triangular */
25000b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--) {
25010b8f6341SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
25020b8f6341SShri Abhyankar     vi   = aj + adiag[i+1]+1;
25030b8f6341SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
25040b8f6341SShri Abhyankar     idt  = bs*i;
25050fa040f9SShri Abhyankar     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
25060fa040f9SShri Abhyankar     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
25070fa040f9SShri Abhyankar     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
25080b8f6341SShri Abhyankar 
25090b8f6341SShri Abhyankar     for (m=0;m<nz;m++) {
25100b8f6341SShri Abhyankar       idx   = bs*vi[m];
25110b8f6341SShri Abhyankar       for (k=0;k<15;k++) {
251253ef36baSBarry Smith         xv = x[k + idx];
251353ef36baSBarry Smith         s[0]  -= v[0]*xv;
251453ef36baSBarry Smith         s[1]  -= v[1]*xv;
251553ef36baSBarry Smith         s[2]  -= v[2]*xv;
251653ef36baSBarry Smith         s[3]  -= v[3]*xv;
251753ef36baSBarry Smith         s[4]  -= v[4]*xv;
251853ef36baSBarry Smith         s[5]  -= v[5]*xv;
251953ef36baSBarry Smith         s[6]  -= v[6]*xv;
252053ef36baSBarry Smith         s[7]  -= v[7]*xv;
252153ef36baSBarry Smith         s[8]  -= v[8]*xv;
252253ef36baSBarry Smith         s[9]  -= v[9]*xv;
252353ef36baSBarry Smith         s[10] -= v[10]*xv;
252453ef36baSBarry Smith         s[11] -= v[11]*xv;
252553ef36baSBarry Smith         s[12] -= v[12]*xv;
252653ef36baSBarry Smith         s[13] -= v[13]*xv;
252753ef36baSBarry Smith         s[14] -= v[14]*xv;
25280b8f6341SShri Abhyankar         v += 15;
25290b8f6341SShri Abhyankar       }
25300b8f6341SShri Abhyankar     }
25310fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
25320b8f6341SShri Abhyankar     for (k=0;k<15;k++) {
25330fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
25340fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
25350fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
25360fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
25370fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
25380fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
25390fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
25400fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
25410fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
25420fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
25430fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
25440fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
25450fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
25460fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
25470fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
25480b8f6341SShri Abhyankar       v += 15;
25490b8f6341SShri Abhyankar     }
25500b8f6341SShri Abhyankar   }
25513649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
25520b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25530b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
25540b8f6341SShri Abhyankar   PetscFunctionReturn(0);
25550b8f6341SShri Abhyankar }
25560b8f6341SShri Abhyankar 
25570b8f6341SShri Abhyankar 
25580b8f6341SShri Abhyankar #undef __FUNCT__
255906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
256006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
25614e2b4712SSatish Balay {
25624e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
25634e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
25646849ba73SBarry Smith   PetscErrorCode    ierr;
2565b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2566b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2567b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2568b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2569b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2570b3260449SShri Abhyankar   const PetscScalar *b;
25714e2b4712SSatish Balay 
25724e2b4712SSatish Balay   PetscFunctionBegin;
25733649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2575f1af5d2fSBarry Smith   t  = a->solve_work;
25764e2b4712SSatish Balay 
25774e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25784e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25794e2b4712SSatish Balay 
25804e2b4712SSatish Balay   /* forward solve the lower triangular */
25814e2b4712SSatish Balay   idx    = 7*(*r++);
2582f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2583f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2584f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
25854e2b4712SSatish Balay 
25864e2b4712SSatish Balay   for (i=1; i<n; i++) {
25874e2b4712SSatish Balay     v     = aa + 49*ai[i];
25884e2b4712SSatish Balay     vi    = aj + ai[i];
25894e2b4712SSatish Balay     nz    = diag[i] - ai[i];
25904e2b4712SSatish Balay     idx   = 7*(*r++);
2591f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2592f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
25934e2b4712SSatish Balay     while (nz--) {
25944e2b4712SSatish Balay       idx   = 7*(*vi++);
2595f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2596f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2597f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2598f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2599f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2600f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2601f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2602f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2603f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2604f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26054e2b4712SSatish Balay       v += 49;
26064e2b4712SSatish Balay     }
26074e2b4712SSatish Balay     idx = 7*i;
2608f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2609f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2610f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
26114e2b4712SSatish Balay   }
26124e2b4712SSatish Balay   /* backward solve the upper triangular */
26134e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
26144e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
26154e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26164e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26174e2b4712SSatish Balay     idt  = 7*i;
2618f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2619f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2620f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
26214e2b4712SSatish Balay     while (nz--) {
26224e2b4712SSatish Balay       idx   = 7*(*vi++);
2623f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2624f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2625f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2626f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2627f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2628f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2629f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2630f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2631f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2632f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26334e2b4712SSatish Balay       v += 49;
26344e2b4712SSatish Balay     }
26354e2b4712SSatish Balay     idc = 7*(*c--);
26364e2b4712SSatish Balay     v   = aa + 49*diag[i];
2637f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2638f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2639f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2640f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2641f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2642f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2643f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2644f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2645f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2646f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2647f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2648f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2649f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2650f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
26514e2b4712SSatish Balay   }
26524e2b4712SSatish Balay 
26534e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26544e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2657dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26584e2b4712SSatish Balay   PetscFunctionReturn(0);
26594e2b4712SSatish Balay }
26604e2b4712SSatish Balay 
26618f690400SShri Abhyankar #undef __FUNCT__
26624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
26634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
266435aa4fcfSShri Abhyankar {
266535aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
266635aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
266735aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2668b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2669b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2670b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2671b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2672b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2673b3260449SShri Abhyankar   const PetscScalar *b;
267435aa4fcfSShri Abhyankar 
267535aa4fcfSShri Abhyankar   PetscFunctionBegin;
26763649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
267735aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
267835aa4fcfSShri Abhyankar   t  = a->solve_work;
267935aa4fcfSShri Abhyankar 
268035aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
268135aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
268235aa4fcfSShri Abhyankar 
268335aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
268435aa4fcfSShri Abhyankar   idx    = 7*r[0];
268535aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
268635aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
268735aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
268835aa4fcfSShri Abhyankar 
268935aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
269035aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
269135aa4fcfSShri Abhyankar     vi    = aj + ai[i];
269235aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
269335aa4fcfSShri Abhyankar     idx   = 7*r[i];
269435aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
269535aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
269635aa4fcfSShri Abhyankar     for (m=0;m<nz;m++) {
269735aa4fcfSShri Abhyankar       idx   = 7*vi[m];
269835aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
269935aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
270035aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
270135aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
270235aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
270335aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
270435aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
270535aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
270635aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
270735aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
270835aa4fcfSShri Abhyankar       v += 49;
270935aa4fcfSShri Abhyankar     }
271035aa4fcfSShri Abhyankar     idx = 7*i;
271135aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
271235aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
271335aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
271435aa4fcfSShri Abhyankar   }
271535aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
271635aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--) {
271735aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
271835aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
271935aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
272035aa4fcfSShri Abhyankar     idt  = 7*i;
272135aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
272235aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
272335aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
272435aa4fcfSShri Abhyankar     for (m=0;m<nz;m++) {
272535aa4fcfSShri Abhyankar       idx   = 7*vi[m];
272635aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
272735aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
272835aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
272935aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
273035aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
273135aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
273235aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
273335aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
273435aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
273535aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
273635aa4fcfSShri Abhyankar       v += 49;
273735aa4fcfSShri Abhyankar     }
273835aa4fcfSShri Abhyankar     idc = 7*c[i];
273935aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
274035aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
274135aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
274235aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
274335aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
274435aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
274535aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
274635aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
274735aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
274835aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
274935aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
275035aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
275135aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
275235aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
275335aa4fcfSShri Abhyankar   }
275435aa4fcfSShri Abhyankar 
275535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
275635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27573649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
275835aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
275935aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
276035aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
276135aa4fcfSShri Abhyankar }
276235aa4fcfSShri Abhyankar 
276335aa4fcfSShri Abhyankar #undef __FUNCT__
276406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
276506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
276615091d37SBarry Smith {
276715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2768b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2769dfbe8321SBarry Smith   PetscErrorCode    ierr;
2770b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2771d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2772d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2773d9fead3dSBarry Smith   const PetscScalar *b;
277415091d37SBarry Smith 
277515091d37SBarry Smith   PetscFunctionBegin;
27763649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
27771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277815091d37SBarry Smith   /* forward solve the lower triangular */
277915091d37SBarry Smith   idx    = 0;
278015091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
278115091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
278215091d37SBarry Smith   x[6] = b[6+idx];
278315091d37SBarry Smith   for (i=1; i<n; i++) {
278415091d37SBarry Smith     v     =  aa + 49*ai[i];
278515091d37SBarry Smith     vi    =  aj + ai[i];
278615091d37SBarry Smith     nz    =  diag[i] - ai[i];
278715091d37SBarry Smith     idx   =  7*i;
2788f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2789f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2790f1af5d2fSBarry Smith     s7  =  b[6+idx];
279115091d37SBarry Smith     while (nz--) {
279215091d37SBarry Smith       jdx   = 7*(*vi++);
279315091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
279415091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
279515091d37SBarry Smith       x7    = x[6+jdx];
2796f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2797f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2798f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2799f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2800f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2801f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2802f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
280315091d37SBarry Smith       v += 49;
280415091d37SBarry Smith      }
2805f1af5d2fSBarry Smith     x[idx]   = s1;
2806f1af5d2fSBarry Smith     x[1+idx] = s2;
2807f1af5d2fSBarry Smith     x[2+idx] = s3;
2808f1af5d2fSBarry Smith     x[3+idx] = s4;
2809f1af5d2fSBarry Smith     x[4+idx] = s5;
2810f1af5d2fSBarry Smith     x[5+idx] = s6;
2811f1af5d2fSBarry Smith     x[6+idx] = s7;
281215091d37SBarry Smith   }
281315091d37SBarry Smith   /* backward solve the upper triangular */
281415091d37SBarry Smith   for (i=n-1; i>=0; i--) {
281515091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
281615091d37SBarry Smith     vi   = aj + diag[i] + 1;
281715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
281815091d37SBarry Smith     idt  = 7*i;
2819f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2820f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2821f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2822f1af5d2fSBarry Smith     s7 = x[6+idt];
282315091d37SBarry Smith     while (nz--) {
282415091d37SBarry Smith       idx   = 7*(*vi++);
282515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
282615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
282715091d37SBarry Smith       x7    = x[6+idx];
2828f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2829f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2830f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2831f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2832f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2833f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2834f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
283515091d37SBarry Smith       v += 49;
283615091d37SBarry Smith     }
283715091d37SBarry Smith     v        = aa + 49*diag[i];
2838f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2839f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2840f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2841f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2842f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2843f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2844f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2845f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2846f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2847f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2848f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2849f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2850f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2851f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
285215091d37SBarry Smith   }
285315091d37SBarry Smith 
28543649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
28551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2856dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
285715091d37SBarry Smith   PetscFunctionReturn(0);
285815091d37SBarry Smith }
285915091d37SBarry Smith 
2860cee9d6f2SShri Abhyankar #undef __FUNCT__
28614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
28624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
286353cca76cSShri Abhyankar {
286453cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2865b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
286653cca76cSShri Abhyankar     PetscErrorCode    ierr;
2867b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
2868b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
286953cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
287053cca76cSShri Abhyankar     PetscScalar       *x;
287153cca76cSShri Abhyankar     const PetscScalar *b;
287253cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
287353cca76cSShri Abhyankar 
287453cca76cSShri Abhyankar     PetscFunctionBegin;
28753649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
287653cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
287753cca76cSShri Abhyankar     /* forward solve the lower triangular */
287853cca76cSShri Abhyankar     idx    = 0;
287953cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
288053cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
288153cca76cSShri Abhyankar     for (i=1; i<n; i++) {
288253cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
288353cca76cSShri Abhyankar        vi   = aj + ai[i];
288453cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
288553cca76cSShri Abhyankar       idx   = bs*i;
288653cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
288753cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
288853cca76cSShri Abhyankar        for (k=0;k<nz;k++) {
288953cca76cSShri Abhyankar           jdx   = bs*vi[k];
289053cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
289153cca76cSShri Abhyankar           x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
289253cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
289353cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
289453cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
289553cca76cSShri Abhyankar           s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
289653cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
289753cca76cSShri Abhyankar           s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
289853cca76cSShri Abhyankar           s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
289953cca76cSShri Abhyankar           v   +=  bs2;
290053cca76cSShri Abhyankar         }
290153cca76cSShri Abhyankar 
290253cca76cSShri Abhyankar        x[idx]   = s1;
290353cca76cSShri Abhyankar        x[1+idx] = s2;
290453cca76cSShri Abhyankar        x[2+idx] = s3;
290553cca76cSShri Abhyankar        x[3+idx] = s4;
290653cca76cSShri Abhyankar        x[4+idx] = s5;
290753cca76cSShri Abhyankar        x[5+idx] = s6;
290853cca76cSShri Abhyankar        x[6+idx] = s7;
290953cca76cSShri Abhyankar     }
291053cca76cSShri Abhyankar 
291153cca76cSShri Abhyankar    /* backward solve the upper triangular */
291253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
291353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
291453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
291553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
291653cca76cSShri Abhyankar      idt = bs*i;
291753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
291853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
291953cca76cSShri Abhyankar     for (k=0;k<nz;k++) {
292053cca76cSShri Abhyankar       idx   = bs*vi[k];
292153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
292253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
292353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
292453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
292553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
292653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
292753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
292853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
292953cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
293053cca76cSShri Abhyankar         v   +=  bs2;
293153cca76cSShri Abhyankar     }
293253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
293353cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
293453cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
293553cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
293653cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
293753cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
293853cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
293953cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
294053cca76cSShri Abhyankar   }
294153cca76cSShri Abhyankar 
29423649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
294353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
294453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
294553cca76cSShri Abhyankar   PetscFunctionReturn(0);
294653cca76cSShri Abhyankar }
294753cca76cSShri Abhyankar 
294853cca76cSShri Abhyankar #undef __FUNCT__
294906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
295006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
295115091d37SBarry Smith {
295215091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
295315091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
29546849ba73SBarry Smith   PetscErrorCode    ierr;
29555d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2956b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2957b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2958d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2959d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2960d9fead3dSBarry Smith   const PetscScalar *b;
2961b3260449SShri Abhyankar 
296215091d37SBarry Smith   PetscFunctionBegin;
29633649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
29641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2965f1af5d2fSBarry Smith   t  = a->solve_work;
296615091d37SBarry Smith 
296715091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
296815091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
296915091d37SBarry Smith 
297015091d37SBarry Smith   /* forward solve the lower triangular */
297115091d37SBarry Smith   idx    = 6*(*r++);
2972f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2973f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2974f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
297515091d37SBarry Smith   for (i=1; i<n; i++) {
297615091d37SBarry Smith     v     = aa + 36*ai[i];
297715091d37SBarry Smith     vi    = aj + ai[i];
297815091d37SBarry Smith     nz    = diag[i] - ai[i];
297915091d37SBarry Smith     idx   = 6*(*r++);
2980f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2981f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
298215091d37SBarry Smith     while (nz--) {
298315091d37SBarry Smith       idx   = 6*(*vi++);
2984f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2985f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2986f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2987f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2988f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2989f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2990f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2991f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
299215091d37SBarry Smith       v += 36;
299315091d37SBarry Smith     }
299415091d37SBarry Smith     idx = 6*i;
2995f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2996f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2997f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
299815091d37SBarry Smith   }
299915091d37SBarry Smith   /* backward solve the upper triangular */
300015091d37SBarry Smith   for (i=n-1; i>=0; i--) {
300115091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
300215091d37SBarry Smith     vi   = aj + diag[i] + 1;
300315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
300415091d37SBarry Smith     idt  = 6*i;
3005f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3006f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
3007f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
300815091d37SBarry Smith     while (nz--) {
300915091d37SBarry Smith       idx   = 6*(*vi++);
3010f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3011f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3012f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
3013f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
301915091d37SBarry Smith       v += 36;
302015091d37SBarry Smith     }
302115091d37SBarry Smith     idc = 6*(*c--);
302215091d37SBarry Smith     v   = aa + 36*diag[i];
3023f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3024f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
3025f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3026f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
3027f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3028f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
3029f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3030f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
3031f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3032f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
3033f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3034f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
303515091d37SBarry Smith   }
303615091d37SBarry Smith 
303715091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
303815091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30393649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
30401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3041dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
304215091d37SBarry Smith   PetscFunctionReturn(0);
304315091d37SBarry Smith }
304415091d37SBarry Smith 
30456506fda5SShri Abhyankar #undef __FUNCT__
30464dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
30474dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
30486506fda5SShri Abhyankar {
30496506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30506506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
30516506fda5SShri Abhyankar   PetscErrorCode    ierr;
30526506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3053b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3054b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
30556506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
30566506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
30576506fda5SShri Abhyankar   const PetscScalar *b;
3058b3260449SShri Abhyankar 
30596506fda5SShri Abhyankar   PetscFunctionBegin;
30603649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30616506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
30626506fda5SShri Abhyankar   t  = a->solve_work;
30636506fda5SShri Abhyankar 
30646506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30656506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
30666506fda5SShri Abhyankar 
30676506fda5SShri Abhyankar   /* forward solve the lower triangular */
30686506fda5SShri Abhyankar   idx    = 6*r[0];
30696506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
30706506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
30716506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
30726506fda5SShri Abhyankar   for (i=1; i<n; i++) {
30736506fda5SShri Abhyankar     v     = aa + 36*ai[i];
30746506fda5SShri Abhyankar     vi    = aj + ai[i];
30756506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
30766506fda5SShri Abhyankar     idx   = 6*r[i];
30776506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30786506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
30796506fda5SShri Abhyankar     for (m=0;m<nz;m++) {
30806506fda5SShri Abhyankar       idx   = 6*vi[m];
30816506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
30826506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
30836506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30846506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30856506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30866506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
30876506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
30886506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30896506fda5SShri Abhyankar       v += 36;
30906506fda5SShri Abhyankar     }
30916506fda5SShri Abhyankar     idx = 6*i;
30926506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
30936506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
30946506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
30956506fda5SShri Abhyankar   }
30966506fda5SShri Abhyankar   /* backward solve the upper triangular */
30976506fda5SShri Abhyankar   for (i=n-1; i>=0; i--) {
30986506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
30996506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
31006506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
31016506fda5SShri Abhyankar     idt  = 6*i;
31026506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
31036506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
31046506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
31056506fda5SShri Abhyankar     for (m=0;m<nz;m++) {
31066506fda5SShri Abhyankar       idx   = 6*vi[m];
31076506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
31086506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
31096506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
31106506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31116506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31126506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31136506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31146506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31156506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31166506fda5SShri Abhyankar       v += 36;
31176506fda5SShri Abhyankar     }
31186506fda5SShri Abhyankar     idc = 6*c[i];
31196506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
31206506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
31216506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
31226506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
31236506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
31246506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
31256506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
31266506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
31276506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
31286506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
31296506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
31306506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
31316506fda5SShri Abhyankar   }
31326506fda5SShri Abhyankar 
31336506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31346506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31353649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
31366506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
31376506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
31386506fda5SShri Abhyankar   PetscFunctionReturn(0);
31396506fda5SShri Abhyankar }
31408f690400SShri Abhyankar 
31418f690400SShri Abhyankar #undef __FUNCT__
314206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
314306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
314415091d37SBarry Smith {
314515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3146b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3147dfbe8321SBarry Smith   PetscErrorCode    ierr;
3148b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3149d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3150d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3151d9fead3dSBarry Smith   const PetscScalar *b;
315215091d37SBarry Smith 
315315091d37SBarry Smith   PetscFunctionBegin;
31543649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31551ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315615091d37SBarry Smith   /* forward solve the lower triangular */
315715091d37SBarry Smith   idx    = 0;
315815091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
315915091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
316015091d37SBarry Smith   for (i=1; i<n; i++) {
316115091d37SBarry Smith     v     =  aa + 36*ai[i];
316215091d37SBarry Smith     vi    =  aj + ai[i];
316315091d37SBarry Smith     nz    =  diag[i] - ai[i];
316415091d37SBarry Smith     idx   =  6*i;
3165f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3166f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
316715091d37SBarry Smith     while (nz--) {
316815091d37SBarry Smith       jdx   = 6*(*vi++);
316915091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
317015091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3171f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3172f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3173f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3174f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3175f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3176f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
317715091d37SBarry Smith       v += 36;
317815091d37SBarry Smith      }
3179f1af5d2fSBarry Smith     x[idx]   = s1;
3180f1af5d2fSBarry Smith     x[1+idx] = s2;
3181f1af5d2fSBarry Smith     x[2+idx] = s3;
3182f1af5d2fSBarry Smith     x[3+idx] = s4;
3183f1af5d2fSBarry Smith     x[4+idx] = s5;
3184f1af5d2fSBarry Smith     x[5+idx] = s6;
318515091d37SBarry Smith   }
318615091d37SBarry Smith   /* backward solve the upper triangular */
318715091d37SBarry Smith   for (i=n-1; i>=0; i--) {
318815091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
318915091d37SBarry Smith     vi   = aj + diag[i] + 1;
319015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
319115091d37SBarry Smith     idt  = 6*i;
3192f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
3193f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
3194f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
319515091d37SBarry Smith     while (nz--) {
319615091d37SBarry Smith       idx   = 6*(*vi++);
319715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
319815091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3199f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3200f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3201f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3202f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3203f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3204f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
320515091d37SBarry Smith       v += 36;
320615091d37SBarry Smith     }
320715091d37SBarry Smith     v        = aa + 36*diag[i];
3208f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3209f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3210f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3211f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3212f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3213f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
321415091d37SBarry Smith   }
321515091d37SBarry Smith 
32163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
32171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3218dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
321915091d37SBarry Smith   PetscFunctionReturn(0);
322015091d37SBarry Smith }
322115091d37SBarry Smith 
3222cee9d6f2SShri Abhyankar #undef __FUNCT__
32234dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
32244dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
322553cca76cSShri Abhyankar {
322653cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3227b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
322853cca76cSShri Abhyankar     PetscErrorCode    ierr;
3229b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
3230b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
323153cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
323253cca76cSShri Abhyankar     PetscScalar       *x;
323353cca76cSShri Abhyankar     const PetscScalar *b;
323453cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
323553cca76cSShri Abhyankar 
323653cca76cSShri Abhyankar     PetscFunctionBegin;
32373649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
323853cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
323953cca76cSShri Abhyankar     /* forward solve the lower triangular */
324053cca76cSShri Abhyankar     idx    = 0;
324153cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
324253cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
324353cca76cSShri Abhyankar     for (i=1; i<n; i++) {
324453cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
324553cca76cSShri Abhyankar        vi   = aj + ai[i];
324653cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
324753cca76cSShri Abhyankar       idx   = bs*i;
324853cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
324953cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
325053cca76cSShri Abhyankar        for (k=0;k<nz;k++) {
325153cca76cSShri Abhyankar           jdx   = bs*vi[k];
325253cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
325353cca76cSShri Abhyankar           x5    = x[4+jdx]; x6 = x[5+jdx];
325453cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
325553cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
325653cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
325753cca76cSShri Abhyankar           s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
325853cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
325953cca76cSShri Abhyankar           s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
326053cca76cSShri Abhyankar           v   +=  bs2;
326153cca76cSShri Abhyankar         }
326253cca76cSShri Abhyankar 
326353cca76cSShri Abhyankar        x[idx]   = s1;
326453cca76cSShri Abhyankar        x[1+idx] = s2;
326553cca76cSShri Abhyankar        x[2+idx] = s3;
326653cca76cSShri Abhyankar        x[3+idx] = s4;
326753cca76cSShri Abhyankar        x[4+idx] = s5;
326853cca76cSShri Abhyankar        x[5+idx] = s6;
326953cca76cSShri Abhyankar     }
327053cca76cSShri Abhyankar 
327153cca76cSShri Abhyankar    /* backward solve the upper triangular */
327253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
327353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
327453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
327553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
327653cca76cSShri Abhyankar      idt = bs*i;
327753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
327853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
327953cca76cSShri Abhyankar      for (k=0;k<nz;k++) {
328053cca76cSShri Abhyankar       idx   = bs*vi[k];
328153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
328253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
328353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
328453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
328553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
328653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
328753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
328853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
328953cca76cSShri Abhyankar         v   +=  bs2;
329053cca76cSShri Abhyankar     }
329153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
329253cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
329353cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
329453cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
329553cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
329653cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
329753cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
329853cca76cSShri Abhyankar   }
329953cca76cSShri Abhyankar 
33003649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
330153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
330253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
330353cca76cSShri Abhyankar   PetscFunctionReturn(0);
330453cca76cSShri Abhyankar }
330553cca76cSShri Abhyankar 
330653cca76cSShri Abhyankar #undef __FUNCT__
330706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
330806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
33094e2b4712SSatish Balay {
33104e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
33114e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33126849ba73SBarry Smith   PetscErrorCode    ierr;
33135d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3314b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3315b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3316d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3317d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3318d9fead3dSBarry Smith   const PetscScalar *b;
33194e2b4712SSatish Balay 
33204e2b4712SSatish Balay   PetscFunctionBegin;
33213649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
33221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3323f1af5d2fSBarry Smith   t  = a->solve_work;
33244e2b4712SSatish Balay 
33254e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33264e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33274e2b4712SSatish Balay 
33284e2b4712SSatish Balay   /* forward solve the lower triangular */
33294e2b4712SSatish Balay   idx    = 5*(*r++);
3330f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3331f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
33324e2b4712SSatish Balay   for (i=1; i<n; i++) {
33334e2b4712SSatish Balay     v     = aa + 25*ai[i];
33344e2b4712SSatish Balay     vi    = aj + ai[i];
33354e2b4712SSatish Balay     nz    = diag[i] - ai[i];
33364e2b4712SSatish Balay     idx   = 5*(*r++);
3337f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3338f1af5d2fSBarry Smith     s5  = b[4+idx];
33394e2b4712SSatish Balay     while (nz--) {
33404e2b4712SSatish Balay       idx   = 5*(*vi++);
3341f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3342f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3343f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3344f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3345f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3346f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3347f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33484e2b4712SSatish Balay       v += 25;
33494e2b4712SSatish Balay     }
33504e2b4712SSatish Balay     idx = 5*i;
3351f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3352f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
33534e2b4712SSatish Balay   }
33544e2b4712SSatish Balay   /* backward solve the upper triangular */
33554e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
33564e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
33574e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33584e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
33594e2b4712SSatish Balay     idt  = 5*i;
3360f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3361f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
33624e2b4712SSatish Balay     while (nz--) {
33634e2b4712SSatish Balay       idx   = 5*(*vi++);
3364f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3365f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3366f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3367f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3368f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3369f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3370f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33714e2b4712SSatish Balay       v += 25;
33724e2b4712SSatish Balay     }
33734e2b4712SSatish Balay     idc = 5*(*c--);
33744e2b4712SSatish Balay     v   = aa + 25*diag[i];
3375f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3376f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3377f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3378f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3379f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3380f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3381f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3382f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3383f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3384f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
33854e2b4712SSatish Balay   }
33864e2b4712SSatish Balay 
33874e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
33884e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
33893649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
33901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3391dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
33924e2b4712SSatish Balay   PetscFunctionReturn(0);
33934e2b4712SSatish Balay }
33944e2b4712SSatish Balay 
339578bb4007SShri Abhyankar #undef __FUNCT__
33964dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
33974dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
339878bb4007SShri Abhyankar {
339978bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
340078bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
340178bb4007SShri Abhyankar   PetscErrorCode    ierr;
340278bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3403b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3404b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
340578bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
340678bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
340778bb4007SShri Abhyankar   const PetscScalar *b;
340878bb4007SShri Abhyankar 
340978bb4007SShri Abhyankar   PetscFunctionBegin;
34103649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
341178bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
341278bb4007SShri Abhyankar   t  = a->solve_work;
341378bb4007SShri Abhyankar 
341478bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
341578bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
341678bb4007SShri Abhyankar 
341778bb4007SShri Abhyankar   /* forward solve the lower triangular */
341878bb4007SShri Abhyankar   idx    = 5*r[0];
341978bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
342078bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
342178bb4007SShri Abhyankar   for (i=1; i<n; i++) {
342278bb4007SShri Abhyankar     v     = aa + 25*ai[i];
342378bb4007SShri Abhyankar     vi    = aj + ai[i];
342478bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
342578bb4007SShri Abhyankar     idx   = 5*r[i];
342678bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
342778bb4007SShri Abhyankar     s5  = b[4+idx];
342878bb4007SShri Abhyankar     for (m=0;m<nz;m++) {
342978bb4007SShri Abhyankar       idx   = 5*vi[m];
343078bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
343178bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
343278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
343378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
343478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
343578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
343678bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
343778bb4007SShri Abhyankar       v += 25;
343878bb4007SShri Abhyankar     }
343978bb4007SShri Abhyankar     idx = 5*i;
344078bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
344178bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
344278bb4007SShri Abhyankar   }
344378bb4007SShri Abhyankar   /* backward solve the upper triangular */
344478bb4007SShri Abhyankar   for (i=n-1; i>=0; i--) {
344578bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
344678bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
344778bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
344878bb4007SShri Abhyankar     idt  = 5*i;
344978bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
345078bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
345178bb4007SShri Abhyankar     for (m=0;m<nz;m++) {
345278bb4007SShri Abhyankar       idx   = 5*vi[m];
345378bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
345478bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
345578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
345678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
345778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
345878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
345978bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
346078bb4007SShri Abhyankar       v += 25;
346178bb4007SShri Abhyankar     }
346278bb4007SShri Abhyankar     idc = 5*c[i];
346378bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
346478bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
346578bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
346678bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
346778bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
346878bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
346978bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
347078bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
347178bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
347278bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
347378bb4007SShri Abhyankar   }
347478bb4007SShri Abhyankar 
347578bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
347678bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34773649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
347878bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
347978bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
348078bb4007SShri Abhyankar   PetscFunctionReturn(0);
348178bb4007SShri Abhyankar }
348278bb4007SShri Abhyankar 
34838f690400SShri Abhyankar #undef __FUNCT__
348406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
348506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
348615091d37SBarry Smith {
348715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3488b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3489b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3490dfbe8321SBarry Smith   PetscErrorCode    ierr;
3491d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3492d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3493d9fead3dSBarry Smith   const PetscScalar *b;
349415091d37SBarry Smith 
349515091d37SBarry Smith   PetscFunctionBegin;
34963649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
34971ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
349815091d37SBarry Smith   /* forward solve the lower triangular */
349915091d37SBarry Smith   idx    = 0;
350015091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
350115091d37SBarry Smith   for (i=1; i<n; i++) {
350215091d37SBarry Smith     v     =  aa + 25*ai[i];
350315091d37SBarry Smith     vi    =  aj + ai[i];
350415091d37SBarry Smith     nz    =  diag[i] - ai[i];
350515091d37SBarry Smith     idx   =  5*i;
3506f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
350715091d37SBarry Smith     while (nz--) {
350815091d37SBarry Smith       jdx   = 5*(*vi++);
350915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3510f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3511f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3512f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3513f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3514f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
351515091d37SBarry Smith       v    += 25;
351615091d37SBarry Smith     }
3517f1af5d2fSBarry Smith     x[idx]   = s1;
3518f1af5d2fSBarry Smith     x[1+idx] = s2;
3519f1af5d2fSBarry Smith     x[2+idx] = s3;
3520f1af5d2fSBarry Smith     x[3+idx] = s4;
3521f1af5d2fSBarry Smith     x[4+idx] = s5;
352215091d37SBarry Smith   }
352315091d37SBarry Smith   /* backward solve the upper triangular */
352415091d37SBarry Smith   for (i=n-1; i>=0; i--) {
352515091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
352615091d37SBarry Smith     vi   = aj + diag[i] + 1;
352715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
352815091d37SBarry Smith     idt  = 5*i;
3529f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3530f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
353115091d37SBarry Smith     while (nz--) {
353215091d37SBarry Smith       idx   = 5*(*vi++);
353315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3534f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3535f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3536f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3537f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3538f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
353915091d37SBarry Smith       v    += 25;
354015091d37SBarry Smith     }
354115091d37SBarry Smith     v        = aa + 25*diag[i];
3542f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3543f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3544f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3545f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3546f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
354715091d37SBarry Smith   }
354815091d37SBarry Smith 
35493649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
35501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3551dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
355215091d37SBarry Smith   PetscFunctionReturn(0);
355315091d37SBarry Smith }
355415091d37SBarry Smith 
3555cee9d6f2SShri Abhyankar #undef __FUNCT__
35564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
35574dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
355853cca76cSShri Abhyankar {
355953cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3560b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3561b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
356253cca76cSShri Abhyankar   PetscErrorCode    ierr;
356353cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
356453cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
356553cca76cSShri Abhyankar   const PetscScalar *b;
356653cca76cSShri Abhyankar 
356753cca76cSShri Abhyankar   PetscFunctionBegin;
35683649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
356953cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
357053cca76cSShri Abhyankar   /* forward solve the lower triangular */
357153cca76cSShri Abhyankar   idx    = 0;
357253cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
357353cca76cSShri Abhyankar   for (i=1; i<n; i++) {
357453cca76cSShri Abhyankar     v   = aa + 25*ai[i];
357553cca76cSShri Abhyankar     vi  = aj + ai[i];
357653cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
357753cca76cSShri Abhyankar     idx = 5*i;
357853cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
357953cca76cSShri Abhyankar     for (k=0;k<nz;k++) {
358053cca76cSShri Abhyankar       jdx   = 5*vi[k];
358153cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
358253cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
358353cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
358453cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
358553cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
358653cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
358753cca76cSShri Abhyankar       v    += 25;
358853cca76cSShri Abhyankar     }
358953cca76cSShri Abhyankar     x[idx]   = s1;
359053cca76cSShri Abhyankar     x[1+idx] = s2;
359153cca76cSShri Abhyankar     x[2+idx] = s3;
359253cca76cSShri Abhyankar     x[3+idx] = s4;
359353cca76cSShri Abhyankar     x[4+idx] = s5;
359453cca76cSShri Abhyankar   }
359553cca76cSShri Abhyankar 
359653cca76cSShri Abhyankar   /* backward solve the upper triangular */
359753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--) {
359853cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
359953cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
360053cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
360153cca76cSShri Abhyankar     idt = 5*i;
360253cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
360353cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360453cca76cSShri Abhyankar     for (k=0;k<nz;k++) {
360553cca76cSShri Abhyankar       idx   = 5*vi[k];
360653cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
360753cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
360853cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
360953cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
361053cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
361153cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
361253cca76cSShri Abhyankar       v    += 25;
361353cca76cSShri Abhyankar     }
361453cca76cSShri Abhyankar     /* x = inv_diagonal*x */
361553cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
361653cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
361753cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
361853cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
361953cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
362053cca76cSShri Abhyankar   }
362153cca76cSShri Abhyankar 
36223649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
362353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
362453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
362553cca76cSShri Abhyankar   PetscFunctionReturn(0);
362653cca76cSShri Abhyankar }
362753cca76cSShri Abhyankar 
362853cca76cSShri Abhyankar #undef __FUNCT__
362906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
363006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
36314e2b4712SSatish Balay {
36324e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
36334e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
36346849ba73SBarry Smith   PetscErrorCode    ierr;
3635b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3636b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
36375d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3638d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3639d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3640d9fead3dSBarry Smith   const PetscScalar *b;
36414e2b4712SSatish Balay 
36424e2b4712SSatish Balay   PetscFunctionBegin;
36433649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
36441ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3645f1af5d2fSBarry Smith   t  = a->solve_work;
36464e2b4712SSatish Balay 
36474e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
36484e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
36494e2b4712SSatish Balay 
36504e2b4712SSatish Balay   /* forward solve the lower triangular */
36514e2b4712SSatish Balay   idx    = 4*(*r++);
3652f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3653f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
36544e2b4712SSatish Balay   for (i=1; i<n; i++) {
36554e2b4712SSatish Balay     v     = aa + 16*ai[i];
36564e2b4712SSatish Balay     vi    = aj + ai[i];
36574e2b4712SSatish Balay     nz    = diag[i] - ai[i];
36584e2b4712SSatish Balay     idx   = 4*(*r++);
3659f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
36604e2b4712SSatish Balay     while (nz--) {
36614e2b4712SSatish Balay       idx   = 4*(*vi++);
3662f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3663f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3664f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3665f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3666f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
36674e2b4712SSatish Balay       v    += 16;
36684e2b4712SSatish Balay     }
36694e2b4712SSatish Balay     idx        = 4*i;
3670f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3671f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
36724e2b4712SSatish Balay   }
36734e2b4712SSatish Balay   /* backward solve the upper triangular */
36744e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
36754e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
36764e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
36774e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
36784e2b4712SSatish Balay     idt  = 4*i;
3679f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3680f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
36814e2b4712SSatish Balay     while (nz--) {
36824e2b4712SSatish Balay       idx   = 4*(*vi++);
3683f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3684f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3685f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3686f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3687f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3688f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
36894e2b4712SSatish Balay       v += 16;
36904e2b4712SSatish Balay     }
36914e2b4712SSatish Balay     idc      = 4*(*c--);
36924e2b4712SSatish Balay     v        = aa + 16*diag[i];
3693f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3694f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3695f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3696f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
36974e2b4712SSatish Balay   }
36984e2b4712SSatish Balay 
36994e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
37004e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37013649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
37021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3703dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37044e2b4712SSatish Balay   PetscFunctionReturn(0);
37054e2b4712SSatish Balay }
3706f26ec98cSKris Buschelman 
37078f690400SShri Abhyankar #undef __FUNCT__
37084dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
37094dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
371078bb4007SShri Abhyankar {
371178bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
371278bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
371378bb4007SShri Abhyankar   PetscErrorCode    ierr;
3714b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3715b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
371678bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
371778bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
371878bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
371978bb4007SShri Abhyankar   const PetscScalar *b;
372078bb4007SShri Abhyankar 
372178bb4007SShri Abhyankar   PetscFunctionBegin;
37223649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
372378bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
372478bb4007SShri Abhyankar   t  = a->solve_work;
372578bb4007SShri Abhyankar 
372678bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
372778bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
372878bb4007SShri Abhyankar 
372978bb4007SShri Abhyankar   /* forward solve the lower triangular */
373078bb4007SShri Abhyankar   idx    = 4*r[0];
373178bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
373278bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
373378bb4007SShri Abhyankar   for (i=1; i<n; i++) {
373478bb4007SShri Abhyankar     v     = aa + 16*ai[i];
373578bb4007SShri Abhyankar     vi    = aj + ai[i];
373678bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
373778bb4007SShri Abhyankar     idx   = 4*r[i];
373878bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
373978bb4007SShri Abhyankar     for (m=0;m<nz;m++) {
374078bb4007SShri Abhyankar       idx   = 4*vi[m];
374178bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
374278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
374378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
374478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
374578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
374678bb4007SShri Abhyankar       v    += 16;
374778bb4007SShri Abhyankar     }
374878bb4007SShri Abhyankar     idx        = 4*i;
374978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
375078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
375178bb4007SShri Abhyankar   }
375278bb4007SShri Abhyankar   /* backward solve the upper triangular */
375378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--) {
375478bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
375578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
375678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
375778bb4007SShri Abhyankar     idt  = 4*i;
375878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
375978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
376078bb4007SShri Abhyankar     for (m=0;m<nz;m++) {
376178bb4007SShri Abhyankar       idx   = 4*vi[m];
376278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
376378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
376478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
376578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
376678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
376778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
376878bb4007SShri Abhyankar       v += 16;
376978bb4007SShri Abhyankar     }
377078bb4007SShri Abhyankar     idc      = 4*c[i];
377178bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
377278bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
377378bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
377478bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
377578bb4007SShri Abhyankar   }
377678bb4007SShri Abhyankar 
377778bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
377878bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37793649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
378078bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
378178bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
378278bb4007SShri Abhyankar   PetscFunctionReturn(0);
378378bb4007SShri Abhyankar }
378478bb4007SShri Abhyankar 
378578bb4007SShri Abhyankar #undef __FUNCT__
3786f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3787dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3788f26ec98cSKris Buschelman {
3789f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3790f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
37916849ba73SBarry Smith   PetscErrorCode    ierr;
3792b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3793b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
37945d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3795d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3796d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3797d9fead3dSBarry Smith   PetscScalar       *x;
3798d9fead3dSBarry Smith   const PetscScalar *b;
3799f26ec98cSKris Buschelman 
3800f26ec98cSKris Buschelman   PetscFunctionBegin;
38013649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
38021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3803f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3804f26ec98cSKris Buschelman 
3805f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3806f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3807f26ec98cSKris Buschelman 
3808f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3809f26ec98cSKris Buschelman   idx    = 4*(*r++);
3810f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3811f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3812f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3813f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3814f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3815f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3816f26ec98cSKris Buschelman     vi    = aj + ai[i];
3817f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3818f26ec98cSKris Buschelman     idx   = 4*(*r++);
3819f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3820f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3821f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3822f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3823f26ec98cSKris Buschelman     while (nz--) {
3824f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3825f26ec98cSKris Buschelman       x1  = t[idx];
3826f26ec98cSKris Buschelman       x2  = t[1+idx];
3827f26ec98cSKris Buschelman       x3  = t[2+idx];
3828f26ec98cSKris Buschelman       x4  = t[3+idx];
3829f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3830f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3831f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3832f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3833f26ec98cSKris Buschelman       v    += 16;
3834f26ec98cSKris Buschelman     }
3835f26ec98cSKris Buschelman     idx        = 4*i;
3836f26ec98cSKris Buschelman     t[idx]   = s1;
3837f26ec98cSKris Buschelman     t[1+idx] = s2;
3838f26ec98cSKris Buschelman     t[2+idx] = s3;
3839f26ec98cSKris Buschelman     t[3+idx] = s4;
3840f26ec98cSKris Buschelman   }
3841f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3842f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--) {
3843f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3844f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3845f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3846f26ec98cSKris Buschelman     idt  = 4*i;
3847f26ec98cSKris Buschelman     s1 = t[idt];
3848f26ec98cSKris Buschelman     s2 = t[1+idt];
3849f26ec98cSKris Buschelman     s3 = t[2+idt];
3850f26ec98cSKris Buschelman     s4 = t[3+idt];
3851f26ec98cSKris Buschelman     while (nz--) {
3852f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3853f26ec98cSKris Buschelman       x1  = t[idx];
3854f26ec98cSKris Buschelman       x2  = t[1+idx];
3855f26ec98cSKris Buschelman       x3  = t[2+idx];
3856f26ec98cSKris Buschelman       x4  = t[3+idx];
3857f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3858f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3859f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3860f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3861f26ec98cSKris Buschelman       v += 16;
3862f26ec98cSKris Buschelman     }
3863f26ec98cSKris Buschelman     idc      = 4*(*c--);
3864f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3865f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3866f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3867f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3868f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3869f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3870f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3871f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3872f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3873f26ec98cSKris Buschelman  }
3874f26ec98cSKris Buschelman 
3875f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3876f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38773649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
38781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3879dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3880f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3881f26ec98cSKris Buschelman }
3882f26ec98cSKris Buschelman 
388324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
388424c233c2SKris Buschelman 
388524c233c2SKris Buschelman #include PETSC_HAVE_SSE
388624c233c2SKris Buschelman 
388724c233c2SKris Buschelman #undef __FUNCT__
388824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3889dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
389024c233c2SKris Buschelman {
389124c233c2SKris Buschelman   /*
389224c233c2SKris Buschelman      Note: This code uses demotion of double
389324c233c2SKris Buschelman      to float when performing the mixed-mode computation.
389424c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
389524c233c2SKris Buschelman   */
389624c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
389724c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
38986849ba73SBarry Smith   PetscErrorCode ierr;
38995d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
39005d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
390124c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
390287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
390324c233c2SKris Buschelman 
390424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
390524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
390624c233c2SKris Buschelman   unsigned long   offset;
390724c233c2SKris Buschelman 
390824c233c2SKris Buschelman   PetscFunctionBegin;
390924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
391024c233c2SKris Buschelman 
391124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
391224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
391324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
391424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
391524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
391624c233c2SKris Buschelman 
39171ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39181ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
391924c233c2SKris Buschelman     t  = a->solve_work;
392024c233c2SKris Buschelman 
392124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
392224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
392324c233c2SKris Buschelman 
392424c233c2SKris Buschelman     /* forward solve the lower triangular */
392524c233c2SKris Buschelman     idx  = 4*(*r++);
392624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
392724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
392824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
392924c233c2SKris Buschelman 
393024c233c2SKris Buschelman     for (i=1; i<n;) {
393124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
393224c233c2SKris Buschelman       vi   =  aj      + ai[i];
393324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
393424c233c2SKris Buschelman       idx  =  4*(*r++);
393524c233c2SKris Buschelman 
393624c233c2SKris Buschelman       /* Demote sum from double to float */
393724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
393824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
393924c233c2SKris Buschelman 
394024c233c2SKris Buschelman       while (nz--) {
394124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
394224c233c2SKris Buschelman         idx = 4*(*vi++);
394324c233c2SKris Buschelman 
394424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
394524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
394624c233c2SKris Buschelman 
394724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
394824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
394924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
395024c233c2SKris Buschelman 
395124c233c2SKris Buschelman           /* First Column */
395224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
395324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
395424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
395524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
395624c233c2SKris Buschelman 
395724c233c2SKris Buschelman           /* Second Column */
395824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
395924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
396024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
396124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
396224c233c2SKris Buschelman 
396324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
396424c233c2SKris Buschelman 
396524c233c2SKris Buschelman           /* Third Column */
396624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
396724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
396824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
396924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
397024c233c2SKris Buschelman 
397124c233c2SKris Buschelman           /* Fourth Column */
397224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
397324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
397424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
397524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
397624c233c2SKris Buschelman         SSE_INLINE_END_2
397724c233c2SKris Buschelman 
397824c233c2SKris Buschelman         v  += 16;
397924c233c2SKris Buschelman       }
398024c233c2SKris Buschelman       idx = 4*i;
398124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
398224c233c2SKris Buschelman       PREFETCH_NTA(v);
398324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
398424c233c2SKris Buschelman 
398524c233c2SKris Buschelman       /* Promote result from float to double */
398624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
398724c233c2SKris Buschelman     }
398824c233c2SKris Buschelman     /* backward solve the upper triangular */
398924c233c2SKris Buschelman     idt  = 4*(n-1);
399024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
399124c233c2SKris Buschelman     v    = aa + ai16 + 16;
399224c233c2SKris Buschelman     for (i=n-1; i>=0;) {
399324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
399424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
399524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
399624c233c2SKris Buschelman 
399724c233c2SKris Buschelman       /* Demote accumulator from double to float */
399824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
399924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
400024c233c2SKris Buschelman 
400124c233c2SKris Buschelman       while (nz--) {
400224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
400324c233c2SKris Buschelman         idx = 4*(*vi++);
400424c233c2SKris Buschelman 
400524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
400624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
400724c233c2SKris Buschelman 
400824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
400924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
401024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
401124c233c2SKris Buschelman 
401224c233c2SKris Buschelman           /* First Column */
401324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
401424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
401524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
401624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
401724c233c2SKris Buschelman 
401824c233c2SKris Buschelman           /* Second Column */
401924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
402024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
402124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
402224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
402324c233c2SKris Buschelman 
402424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
402524c233c2SKris Buschelman 
402624c233c2SKris Buschelman           /* Third Column */
402724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
402824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
402924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
403024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
403124c233c2SKris Buschelman 
403224c233c2SKris Buschelman           /* Fourth Column */
403324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
403424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
403524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
403624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
403724c233c2SKris Buschelman         SSE_INLINE_END_2
403824c233c2SKris Buschelman         v  += 16;
403924c233c2SKris Buschelman       }
404024c233c2SKris Buschelman       v    = aa + ai16;
404124c233c2SKris Buschelman       ai16 = 16*diag[--i];
404224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
404324c233c2SKris Buschelman       /*
404424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
404524c233c2SKris Buschelman          which was inverted as part of the factorization
404624c233c2SKris Buschelman       */
404724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
404824c233c2SKris Buschelman         /* First Column */
404924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
405024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
405124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
405224c233c2SKris Buschelman 
405324c233c2SKris Buschelman         /* Second Column */
405424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
405524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
405624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
405724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
405824c233c2SKris Buschelman 
405924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
406024c233c2SKris Buschelman 
406124c233c2SKris Buschelman         /* Third Column */
406224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
406324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
406424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
406524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
406624c233c2SKris Buschelman 
406724c233c2SKris Buschelman         /* Fourth Column */
406824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
406924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
407024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
407124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
407224c233c2SKris Buschelman 
407324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
407424c233c2SKris Buschelman       SSE_INLINE_END_3
407524c233c2SKris Buschelman 
407624c233c2SKris Buschelman       /* Promote solution from float to double */
407724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
407824c233c2SKris Buschelman 
407924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
408024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
408124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
408224c233c2SKris Buschelman       idc  = 4*(*c--);
408324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
408424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
408524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
408624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
408724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
408824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
408924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
409024c233c2SKris Buschelman       SSE_INLINE_END_2
409124c233c2SKris Buschelman       v    = aa + ai16 + 16;
409224c233c2SKris Buschelman       idt -= 4;
409324c233c2SKris Buschelman     }
409424c233c2SKris Buschelman 
409524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
409624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40971ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40981ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4099dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
410024c233c2SKris Buschelman   SSE_SCOPE_END;
410124c233c2SKris Buschelman   PetscFunctionReturn(0);
410224c233c2SKris Buschelman }
410324c233c2SKris Buschelman 
410424c233c2SKris Buschelman #endif
41050ef38995SBarry Smith 
41060ef38995SBarry Smith 
41074e2b4712SSatish Balay /*
41084e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
41094e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
41104e2b4712SSatish Balay */
41114a2ae208SSatish Balay #undef __FUNCT__
411206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
411306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
41144e2b4712SSatish Balay {
41154e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4116356650c2SBarry Smith   PetscInt          n=a->mbs;
4117356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4118dfbe8321SBarry Smith   PetscErrorCode    ierr;
4119356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4120d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
4121d9fead3dSBarry Smith   PetscScalar       *x;
4122d9fead3dSBarry Smith   const PetscScalar *b;
41234e2b4712SSatish Balay 
41244e2b4712SSatish Balay   PetscFunctionBegin;
41253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
41261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41274e2b4712SSatish Balay 
4128aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
41292853dc0eSBarry Smith   {
413087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41312853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
41322853dc0eSBarry Smith   }
4133aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
41342853dc0eSBarry Smith   {
413587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41362853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
41372853dc0eSBarry Smith   }
4138aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
41392853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4140e1293385SBarry Smith #else
414130d4dcafSBarry Smith   {
414287828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4143d9fead3dSBarry Smith     const MatScalar *v;
4144356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4145356650c2SBarry Smith     const PetscInt  *vi;
4146e1293385SBarry Smith 
41474e2b4712SSatish Balay   /* forward solve the lower triangular */
41484e2b4712SSatish Balay   idx    = 0;
4149e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
41504e2b4712SSatish Balay   for (i=1; i<n; i++) {
41514e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
41524e2b4712SSatish Balay     vi    =  aj      + ai[i];
41534e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
4154e1293385SBarry Smith     idx   +=  4;
4155f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
41564e2b4712SSatish Balay     while (nz--) {
41574e2b4712SSatish Balay       jdx   = 4*(*vi++);
41584e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4159f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4160f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4161f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4162f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
41634e2b4712SSatish Balay       v    += 16;
41644e2b4712SSatish Balay     }
4165f1af5d2fSBarry Smith     x[idx]   = s1;
4166f1af5d2fSBarry Smith     x[1+idx] = s2;
4167f1af5d2fSBarry Smith     x[2+idx] = s3;
4168f1af5d2fSBarry Smith     x[3+idx] = s4;
41694e2b4712SSatish Balay   }
41704e2b4712SSatish Balay   /* backward solve the upper triangular */
41714e555682SBarry Smith   idt = 4*(n-1);
41724e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
41734e555682SBarry Smith     ai16 = 16*diag[i];
41744e555682SBarry Smith     v    = aa + ai16 + 16;
41754e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41764e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4177f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4178f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
41794e2b4712SSatish Balay     while (nz--) {
41804e2b4712SSatish Balay       idx   = 4*(*vi++);
41814e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4182f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4183f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4184f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4185f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
41864e2b4712SSatish Balay       v    += 16;
41874e2b4712SSatish Balay     }
41884e555682SBarry Smith     v        = aa + ai16;
4189f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4190f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4191f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4192f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4193329f5518SBarry Smith     idt -= 4;
41944e2b4712SSatish Balay   }
419530d4dcafSBarry Smith   }
4196e1293385SBarry Smith #endif
41974e2b4712SSatish Balay 
41983649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
41991ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4200dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
42014e2b4712SSatish Balay   PetscFunctionReturn(0);
42024e2b4712SSatish Balay }
42034e2b4712SSatish Balay 
4204b2b2dd24SShri Abhyankar #undef __FUNCT__
42054dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
42064dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4207b2b2dd24SShri Abhyankar {
4208b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4209b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4210b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4211b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4212b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4213b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4214b2b2dd24SShri Abhyankar     PetscScalar       *x;
4215b2b2dd24SShri Abhyankar     const PetscScalar *b;
4216b2b2dd24SShri Abhyankar     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4217cee9d6f2SShri Abhyankar 
4218b2b2dd24SShri Abhyankar     PetscFunctionBegin;
42193649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4220b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4221b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4222b2b2dd24SShri Abhyankar     idx    = 0;
4223b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4224b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4225b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4226b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4227b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4228b2b2dd24SShri Abhyankar       idx   = bs*i;
4229b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4230b2b2dd24SShri Abhyankar       for (k=0;k<nz;k++) {
4231b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
4232b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4233b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4234b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4235b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4236b2b2dd24SShri Abhyankar           s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4237b2b2dd24SShri Abhyankar 
4238b2b2dd24SShri Abhyankar           v   +=  bs2;
4239b2b2dd24SShri Abhyankar         }
4240b2b2dd24SShri Abhyankar 
4241b2b2dd24SShri Abhyankar        x[idx]   = s1;
4242b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4243b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4244b2b2dd24SShri Abhyankar        x[3+idx] = s4;
4245b2b2dd24SShri Abhyankar     }
4246b2b2dd24SShri Abhyankar 
4247b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4248b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
4249b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4250b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4251b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4252b2b2dd24SShri Abhyankar      idt = bs*i;
4253b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4254b2b2dd24SShri Abhyankar 
4255b2b2dd24SShri Abhyankar     for (k=0;k<nz;k++) {
4256b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
4257b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4258b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4259b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4260b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4261b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4262b2b2dd24SShri Abhyankar 
4263b2b2dd24SShri Abhyankar         v   +=  bs2;
4264b2b2dd24SShri Abhyankar     }
4265b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4266b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4267b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4268b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4269b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4270b2b2dd24SShri Abhyankar 
4271b2b2dd24SShri Abhyankar   }
4272b2b2dd24SShri Abhyankar 
42733649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4274b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4275b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4276b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4277b2b2dd24SShri Abhyankar }
4278cee9d6f2SShri Abhyankar 
4279cee9d6f2SShri Abhyankar #undef __FUNCT__
4280f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4281dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4282f26ec98cSKris Buschelman {
4283f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4284b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4285dfbe8321SBarry Smith   PetscErrorCode    ierr;
4286b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4287b3260449SShri Abhyankar   const PetscScalar *b;
4288b3260449SShri Abhyankar   PetscScalar       *x;
4289f26ec98cSKris Buschelman 
4290f26ec98cSKris Buschelman   PetscFunctionBegin;
42913649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
42921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4293f26ec98cSKris Buschelman 
4294f26ec98cSKris Buschelman   {
4295f26ec98cSKris Buschelman     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4296b3260449SShri Abhyankar     const MatScalar  *v;
4297b3260449SShri Abhyankar     MatScalar        *t=(MatScalar *)x;
4298b3260449SShri Abhyankar     PetscInt         jdx,idt,idx,nz,i,ai16;
4299b3260449SShri Abhyankar     const PetscInt   *vi;
4300f26ec98cSKris Buschelman 
4301f26ec98cSKris Buschelman     /* forward solve the lower triangular */
4302f26ec98cSKris Buschelman     idx  = 0;
4303f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4304f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4305f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4306f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4307f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4308f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
4309f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
4310f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
4311f26ec98cSKris Buschelman       idx   +=  4;
4312f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
4313f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
4314f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
4315f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
4316f26ec98cSKris Buschelman       while (nz--) {
4317f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4318f26ec98cSKris Buschelman         x1  = t[jdx];
4319f26ec98cSKris Buschelman         x2  = t[1+jdx];
4320f26ec98cSKris Buschelman         x3  = t[2+jdx];
4321f26ec98cSKris Buschelman         x4  = t[3+jdx];
4322f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4323f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4324f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4325f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4326f26ec98cSKris Buschelman         v    += 16;
4327f26ec98cSKris Buschelman       }
4328f26ec98cSKris Buschelman       t[idx]   = s1;
4329f26ec98cSKris Buschelman       t[1+idx] = s2;
4330f26ec98cSKris Buschelman       t[2+idx] = s3;
4331f26ec98cSKris Buschelman       t[3+idx] = s4;
4332f26ec98cSKris Buschelman     }
4333f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4334f26ec98cSKris Buschelman     idt = 4*(n-1);
4335f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--) {
4336f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4337f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4338f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4339f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4340f26ec98cSKris Buschelman       s1   = t[idt];
4341f26ec98cSKris Buschelman       s2   = t[1+idt];
4342f26ec98cSKris Buschelman       s3   = t[2+idt];
4343f26ec98cSKris Buschelman       s4   = t[3+idt];
4344f26ec98cSKris Buschelman       while (nz--) {
4345f26ec98cSKris Buschelman         idx = 4*(*vi++);
4346f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4347f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4348f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4349f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4350f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4351f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4352f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4353f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4354f26ec98cSKris Buschelman         v    += 16;
4355f26ec98cSKris Buschelman       }
4356f26ec98cSKris Buschelman       v        = aa + ai16;
4357f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4358f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4359f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4360f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4361f26ec98cSKris Buschelman       idt -= 4;
4362f26ec98cSKris Buschelman     }
4363f26ec98cSKris Buschelman   }
4364f26ec98cSKris Buschelman 
43653649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
43661ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4367dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4368f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4369f26ec98cSKris Buschelman }
4370f26ec98cSKris Buschelman 
43713660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
43723660e330SKris Buschelman 
43733660e330SKris Buschelman #include PETSC_HAVE_SSE
43743660e330SKris Buschelman #undef __FUNCT__
43757cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4376dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
43773660e330SKris Buschelman {
43783660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
43792aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4380dfbe8321SBarry Smith   PetscErrorCode ierr;
4381dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
43823660e330SKris Buschelman   MatScalar      *aa=a->a;
438387828ca2SBarry Smith   PetscScalar    *x,*b;
43843660e330SKris Buschelman 
43853660e330SKris Buschelman   PetscFunctionBegin;
43863660e330SKris Buschelman   SSE_SCOPE_BEGIN;
43873660e330SKris Buschelman   /*
43883660e330SKris Buschelman      Note: This code currently uses demotion of double
43893660e330SKris Buschelman      to float when performing the mixed-mode computation.
43903660e330SKris Buschelman      This may not be numerically reasonable for all applications.
43913660e330SKris Buschelman   */
43923660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
43933660e330SKris Buschelman 
43941ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
43951ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43963660e330SKris Buschelman   {
4397eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4398eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
43992aa5897fSKris Buschelman     int            nz,i,idt,ai16;
44002aa5897fSKris Buschelman     unsigned int   jdx,idx;
44012aa5897fSKris Buschelman     unsigned short *vi;
4402eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
44033660e330SKris Buschelman 
4404eb05f457SKris Buschelman     /* First block is the identity. */
44053660e330SKris Buschelman     idx  = 0;
4406eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
44072aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
44083660e330SKris Buschelman 
44093660e330SKris Buschelman     for (i=1; i<n;) {
44103660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44113660e330SKris Buschelman       vi   =  aj      + ai[i];
44123660e330SKris Buschelman       nz   =  diag[i] - ai[i];
44133660e330SKris Buschelman       idx +=  4;
44143660e330SKris Buschelman 
4415eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4416eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4417eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
44183660e330SKris Buschelman 
44193660e330SKris Buschelman       while (nz--) {
44203660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44212aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
44223660e330SKris Buschelman 
44233660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4424eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
44253660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44263660e330SKris Buschelman 
44273660e330SKris Buschelman           /* First Column */
44283660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44293660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44303660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44313660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44323660e330SKris Buschelman 
44333660e330SKris Buschelman           /* Second Column */
44343660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44353660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44363660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44373660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44383660e330SKris Buschelman 
44393660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44403660e330SKris Buschelman 
44413660e330SKris Buschelman           /* Third Column */
44423660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44433660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44443660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44453660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44463660e330SKris Buschelman 
44473660e330SKris Buschelman           /* Fourth Column */
44483660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
44493660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
44503660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44513660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
44523660e330SKris Buschelman         SSE_INLINE_END_2
44533660e330SKris Buschelman 
44543660e330SKris Buschelman         v  += 16;
44553660e330SKris Buschelman       }
44563660e330SKris Buschelman       v    =  aa + 16*ai[++i];
44573660e330SKris Buschelman       PREFETCH_NTA(v);
4458eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
44593660e330SKris Buschelman     }
4460eb05f457SKris Buschelman 
4461eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4462eb05f457SKris Buschelman 
44633660e330SKris Buschelman     idt  = 4*(n-1);
44643660e330SKris Buschelman     ai16 = 16*diag[n-1];
44653660e330SKris Buschelman     v    = aa + ai16 + 16;
44663660e330SKris Buschelman     for (i=n-1; i>=0;) {
44673660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44683660e330SKris Buschelman       vi = aj + diag[i] + 1;
44693660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
44703660e330SKris Buschelman 
4471eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
44723660e330SKris Buschelman 
44733660e330SKris Buschelman       while (nz--) {
44743660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44752aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
44763660e330SKris Buschelman 
44773660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4478eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
44793660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44803660e330SKris Buschelman 
44813660e330SKris Buschelman           /* First Column */
44823660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44833660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44843660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44863660e330SKris Buschelman 
44873660e330SKris Buschelman           /* Second Column */
44883660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44893660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44903660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44913660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44923660e330SKris Buschelman 
44933660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44943660e330SKris Buschelman 
44953660e330SKris Buschelman           /* Third Column */
44963660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44973660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44983660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44993660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
45003660e330SKris Buschelman 
45013660e330SKris Buschelman           /* Fourth Column */
45023660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
45033660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
45043660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45053660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
45063660e330SKris Buschelman         SSE_INLINE_END_2
45073660e330SKris Buschelman         v  += 16;
45083660e330SKris Buschelman       }
45093660e330SKris Buschelman       v    = aa + ai16;
45103660e330SKris Buschelman       ai16 = 16*diag[--i];
45113660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
45123660e330SKris Buschelman       /*
45133660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
45143660e330SKris Buschelman          which was inverted as part of the factorization
45153660e330SKris Buschelman       */
4516eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
45173660e330SKris Buschelman         /* First Column */
45183660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
45193660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
45203660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
45213660e330SKris Buschelman 
45223660e330SKris Buschelman         /* Second Column */
45233660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
45243660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
45253660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
45263660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
45273660e330SKris Buschelman 
45283660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
45293660e330SKris Buschelman 
45303660e330SKris Buschelman         /* Third Column */
45313660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
45323660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
45333660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
45343660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
45353660e330SKris Buschelman 
45363660e330SKris Buschelman         /* Fourth Column */
45373660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
45383660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
45393660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
45403660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
45413660e330SKris Buschelman 
45423660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
45433660e330SKris Buschelman       SSE_INLINE_END_3
45443660e330SKris Buschelman 
45453660e330SKris Buschelman       v    = aa + ai16 + 16;
45463660e330SKris Buschelman       idt -= 4;
45473660e330SKris Buschelman     }
4548eb05f457SKris Buschelman 
4549eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4550eb05f457SKris Buschelman     idt = 4*(n-1);
4551eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4552eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4553eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4554eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4555eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4556eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4557eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4558eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4559eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
456054693613SKris Buschelman       idt -= 4;
45613660e330SKris Buschelman     }
4562eb05f457SKris Buschelman 
4563eb05f457SKris Buschelman   } /* End of artificial scope. */
45641ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45651ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4566dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
45673660e330SKris Buschelman   SSE_SCOPE_END;
45683660e330SKris Buschelman   PetscFunctionReturn(0);
45693660e330SKris Buschelman }
45703660e330SKris Buschelman 
45717cf1b8d3SKris Buschelman #undef __FUNCT__
45727cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4573dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
45747cf1b8d3SKris Buschelman {
45757cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
45767cf1b8d3SKris Buschelman   int            *aj=a->j;
4577dfbe8321SBarry Smith   PetscErrorCode ierr;
4578dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
45797cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
45807cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
45817cf1b8d3SKris Buschelman 
45827cf1b8d3SKris Buschelman   PetscFunctionBegin;
45837cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
45847cf1b8d3SKris Buschelman   /*
45857cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
45867cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
45877cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
45887cf1b8d3SKris Buschelman   */
45897cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
45907cf1b8d3SKris Buschelman 
45911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45937cf1b8d3SKris Buschelman   {
45947cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
45957cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
45967cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
45977cf1b8d3SKris Buschelman     int       jdx,idx;
45987cf1b8d3SKris Buschelman     int       *vi;
45997cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
46007cf1b8d3SKris Buschelman 
46017cf1b8d3SKris Buschelman     /* First block is the identity. */
46027cf1b8d3SKris Buschelman     idx  = 0;
46037cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
46047cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
46057cf1b8d3SKris Buschelman 
46067cf1b8d3SKris Buschelman     for (i=1; i<n;) {
46077cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46087cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
46097cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
46107cf1b8d3SKris Buschelman       idx +=  4;
46117cf1b8d3SKris Buschelman 
46127cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
46137cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
46147cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
46157cf1b8d3SKris Buschelman 
46167cf1b8d3SKris Buschelman       while (nz--) {
46177cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46187cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
46197cf1b8d3SKris Buschelman /*          jdx = *vi++; */
46207cf1b8d3SKris Buschelman 
46217cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
46227cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
46237cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46247cf1b8d3SKris Buschelman 
46257cf1b8d3SKris Buschelman           /* First Column */
46267cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46277cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46287cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46297cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46307cf1b8d3SKris Buschelman 
46317cf1b8d3SKris Buschelman           /* Second Column */
46327cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46337cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46347cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46357cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46367cf1b8d3SKris Buschelman 
46377cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46387cf1b8d3SKris Buschelman 
46397cf1b8d3SKris Buschelman           /* Third Column */
46407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46447cf1b8d3SKris Buschelman 
46457cf1b8d3SKris Buschelman           /* Fourth Column */
46467cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
46477cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
46487cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46497cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
46507cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46517cf1b8d3SKris Buschelman 
46527cf1b8d3SKris Buschelman         v  += 16;
46537cf1b8d3SKris Buschelman       }
46547cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
46557cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
46567cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
46577cf1b8d3SKris Buschelman     }
46587cf1b8d3SKris Buschelman 
46597cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
46607cf1b8d3SKris Buschelman 
46617cf1b8d3SKris Buschelman     idt  = 4*(n-1);
46627cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
46637cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
46647cf1b8d3SKris Buschelman     for (i=n-1; i>=0;) {
46657cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46667cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
46677cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
46687cf1b8d3SKris Buschelman 
46697cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
46707cf1b8d3SKris Buschelman 
46717cf1b8d3SKris Buschelman       while (nz--) {
46727cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46737cf1b8d3SKris Buschelman         idx = 4*(*vi++);
46747cf1b8d3SKris Buschelman /*          idx = *vi++; */
46757cf1b8d3SKris Buschelman 
46767cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
46777cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
46787cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46797cf1b8d3SKris Buschelman 
46807cf1b8d3SKris Buschelman           /* First Column */
46817cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46827cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46837cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46847cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46857cf1b8d3SKris Buschelman 
46867cf1b8d3SKris Buschelman           /* Second Column */
46877cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46887cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46897cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46907cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46917cf1b8d3SKris Buschelman 
46927cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46937cf1b8d3SKris Buschelman 
46947cf1b8d3SKris Buschelman           /* Third Column */
46957cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46967cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46977cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46987cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46997cf1b8d3SKris Buschelman 
47007cf1b8d3SKris Buschelman           /* Fourth Column */
47017cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
47027cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
47037cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
47047cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
47057cf1b8d3SKris Buschelman         SSE_INLINE_END_2
47067cf1b8d3SKris Buschelman         v  += 16;
47077cf1b8d3SKris Buschelman       }
47087cf1b8d3SKris Buschelman       v    = aa + ai16;
47097cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
47107cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
47117cf1b8d3SKris Buschelman       /*
47127cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
47137cf1b8d3SKris Buschelman          which was inverted as part of the factorization
47147cf1b8d3SKris Buschelman       */
47157cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
47167cf1b8d3SKris Buschelman         /* First Column */
47177cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
47187cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
47197cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
47207cf1b8d3SKris Buschelman 
47217cf1b8d3SKris Buschelman         /* Second Column */
47227cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
47237cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
47247cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
47257cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
47267cf1b8d3SKris Buschelman 
47277cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
47287cf1b8d3SKris Buschelman 
47297cf1b8d3SKris Buschelman         /* Third Column */
47307cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
47317cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
47327cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
47337cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
47347cf1b8d3SKris Buschelman 
47357cf1b8d3SKris Buschelman         /* Fourth Column */
47367cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
47377cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
47387cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
47397cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
47407cf1b8d3SKris Buschelman 
47417cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
47427cf1b8d3SKris Buschelman       SSE_INLINE_END_3
47437cf1b8d3SKris Buschelman 
47447cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
47457cf1b8d3SKris Buschelman       idt -= 4;
47467cf1b8d3SKris Buschelman     }
47477cf1b8d3SKris Buschelman 
47487cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
47497cf1b8d3SKris Buschelman     idt = 4*(n-1);
47507cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
47517cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
47527cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
47537cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
47547cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
47557cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
47567cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
47577cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
47587cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
47597cf1b8d3SKris Buschelman       idt -= 4;
47607cf1b8d3SKris Buschelman     }
47617cf1b8d3SKris Buschelman 
47627cf1b8d3SKris Buschelman   } /* End of artificial scope. */
47631ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
47641ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4765dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
47667cf1b8d3SKris Buschelman   SSE_SCOPE_END;
47677cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
47687cf1b8d3SKris Buschelman }
47697cf1b8d3SKris Buschelman 
47703660e330SKris Buschelman #endif
47718f690400SShri Abhyankar 
47724a2ae208SSatish Balay #undef __FUNCT__
477306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
477406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
47754e2b4712SSatish Balay {
47764e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47774e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
47786849ba73SBarry Smith   PetscErrorCode    ierr;
4779b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4780b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
47815d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4782d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4783d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4784d9fead3dSBarry Smith   const PetscScalar *b;
47854e2b4712SSatish Balay 
47864e2b4712SSatish Balay   PetscFunctionBegin;
47873649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
47881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4789f1af5d2fSBarry Smith   t  = a->solve_work;
47904e2b4712SSatish Balay 
47914e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47924e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47934e2b4712SSatish Balay 
47944e2b4712SSatish Balay   /* forward solve the lower triangular */
47954e2b4712SSatish Balay   idx    = 3*(*r++);
4796f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
47974e2b4712SSatish Balay   for (i=1; i<n; i++) {
47984e2b4712SSatish Balay     v     = aa + 9*ai[i];
47994e2b4712SSatish Balay     vi    = aj + ai[i];
48004e2b4712SSatish Balay     nz    = diag[i] - ai[i];
48014e2b4712SSatish Balay     idx   = 3*(*r++);
4802f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48034e2b4712SSatish Balay     while (nz--) {
48044e2b4712SSatish Balay       idx   = 3*(*vi++);
4805f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4806f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4807f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4808f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48094e2b4712SSatish Balay       v += 9;
48104e2b4712SSatish Balay     }
48114e2b4712SSatish Balay     idx = 3*i;
4812f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48134e2b4712SSatish Balay   }
48144e2b4712SSatish Balay   /* backward solve the upper triangular */
48154e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
48164e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
48174e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
48184e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
48194e2b4712SSatish Balay     idt  = 3*i;
4820f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48214e2b4712SSatish Balay     while (nz--) {
48224e2b4712SSatish Balay       idx   = 3*(*vi++);
4823f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4824f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4825f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4826f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48274e2b4712SSatish Balay       v += 9;
48284e2b4712SSatish Balay     }
48294e2b4712SSatish Balay     idc = 3*(*c--);
48304e2b4712SSatish Balay     v   = aa + 9*diag[i];
4831f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4832f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4833f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48344e2b4712SSatish Balay   }
48354e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48364e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48373649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
48381ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4839dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48404e2b4712SSatish Balay   PetscFunctionReturn(0);
48414e2b4712SSatish Balay }
48424e2b4712SSatish Balay 
48430c4413a7SShri Abhyankar #undef __FUNCT__
48444dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
48454dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
48460c4413a7SShri Abhyankar {
48470c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48480c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48490c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4850b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4851b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
48520c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48530c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48540c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
48550c4413a7SShri Abhyankar   const PetscScalar *b;
48560c4413a7SShri Abhyankar 
48570c4413a7SShri Abhyankar   PetscFunctionBegin;
48583649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48590c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48600c4413a7SShri Abhyankar   t  = a->solve_work;
48610c4413a7SShri Abhyankar 
48620c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48630c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
48640c4413a7SShri Abhyankar 
48650c4413a7SShri Abhyankar   /* forward solve the lower triangular */
48660c4413a7SShri Abhyankar   idx    = 3*r[0];
48670c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
48680c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
48690c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
48700c4413a7SShri Abhyankar     vi    = aj + ai[i];
48710c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
48720c4413a7SShri Abhyankar     idx   = 3*r[i];
48730c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48740c4413a7SShri Abhyankar     for (m=0;m<nz;m++) {
48750c4413a7SShri Abhyankar       idx   = 3*vi[m];
48760c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48770c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48780c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48790c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48800c4413a7SShri Abhyankar       v += 9;
48810c4413a7SShri Abhyankar     }
48820c4413a7SShri Abhyankar     idx = 3*i;
48830c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48840c4413a7SShri Abhyankar   }
48850c4413a7SShri Abhyankar   /* backward solve the upper triangular */
48860c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--) {
48870c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
48880c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
48890c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
48900c4413a7SShri Abhyankar     idt  = 3*i;
48910c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48920c4413a7SShri Abhyankar     for (m=0;m<nz;m++) {
48930c4413a7SShri Abhyankar       idx   = 3*vi[m];
48940c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48950c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48960c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48970c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48980c4413a7SShri Abhyankar       v += 9;
48990c4413a7SShri Abhyankar     }
49000c4413a7SShri Abhyankar     idc = 3*c[i];
49010c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
49020c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
49030c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
49040c4413a7SShri Abhyankar   }
49050c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49060c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49073649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49080c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49090c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
49100c4413a7SShri Abhyankar   PetscFunctionReturn(0);
49110c4413a7SShri Abhyankar }
49120c4413a7SShri Abhyankar 
491315091d37SBarry Smith /*
491415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
491515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
491615091d37SBarry Smith */
49174a2ae208SSatish Balay #undef __FUNCT__
491806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
491906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
492015091d37SBarry Smith {
492115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
49220b68f018SBarry Smith   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4923dfbe8321SBarry Smith   PetscErrorCode    ierr;
49240b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4925d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4926d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4927d9fead3dSBarry Smith   const PetscScalar *b;
49280b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
492915091d37SBarry Smith 
493015091d37SBarry Smith   PetscFunctionBegin;
49313649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
49321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
493315091d37SBarry Smith 
493415091d37SBarry Smith   /* forward solve the lower triangular */
493515091d37SBarry Smith   idx    = 0;
493615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
493715091d37SBarry Smith   for (i=1; i<n; i++) {
493815091d37SBarry Smith     v     =  aa      + 9*ai[i];
493915091d37SBarry Smith     vi    =  aj      + ai[i];
494015091d37SBarry Smith     nz    =  diag[i] - ai[i];
494115091d37SBarry Smith     idx   +=  3;
4942f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
494315091d37SBarry Smith     while (nz--) {
494415091d37SBarry Smith       jdx   = 3*(*vi++);
494515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4946f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4947f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4948f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
494915091d37SBarry Smith       v    += 9;
495015091d37SBarry Smith     }
4951f1af5d2fSBarry Smith     x[idx]   = s1;
4952f1af5d2fSBarry Smith     x[1+idx] = s2;
4953f1af5d2fSBarry Smith     x[2+idx] = s3;
495415091d37SBarry Smith   }
495515091d37SBarry Smith   /* backward solve the upper triangular */
495615091d37SBarry Smith   for (i=n-1; i>=0; i--) {
495715091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
495815091d37SBarry Smith     vi   = aj + diag[i] + 1;
495915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
496015091d37SBarry Smith     idt  = 3*i;
4961f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4962f1af5d2fSBarry Smith     s3 = x[2+idt];
496315091d37SBarry Smith     while (nz--) {
496415091d37SBarry Smith       idx   = 3*(*vi++);
496515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4966f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4967f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4968f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
496915091d37SBarry Smith       v    += 9;
497015091d37SBarry Smith     }
497115091d37SBarry Smith     v        = aa +  9*diag[i];
4972f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
497515091d37SBarry Smith   }
497615091d37SBarry Smith 
49773649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4979dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
498015091d37SBarry Smith   PetscFunctionReturn(0);
498115091d37SBarry Smith }
498215091d37SBarry Smith 
4983cee9d6f2SShri Abhyankar #undef __FUNCT__
49844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
49854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4986b2b2dd24SShri Abhyankar {
4987b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4988b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4989b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4990b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4991b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4992b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4993b2b2dd24SShri Abhyankar     PetscScalar       *x;
4994b2b2dd24SShri Abhyankar     const PetscScalar *b;
4995b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4996b2b2dd24SShri Abhyankar 
4997b2b2dd24SShri Abhyankar     PetscFunctionBegin;
49983649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4999b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5000b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5001b2b2dd24SShri Abhyankar     idx    = 0;
5002b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5003b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5004b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
5005b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5006b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5007b2b2dd24SShri Abhyankar       idx   = bs*i;
5008b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5009b2b2dd24SShri Abhyankar       for (k=0;k<nz;k++) {
5010b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
5011b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5012b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5013b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5014b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5015b2b2dd24SShri Abhyankar 
5016b2b2dd24SShri Abhyankar           v   +=  bs2;
5017b2b2dd24SShri Abhyankar         }
5018b2b2dd24SShri Abhyankar 
5019b2b2dd24SShri Abhyankar        x[idx]   = s1;
5020b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5021b2b2dd24SShri Abhyankar        x[2+idx] = s3;
5022b2b2dd24SShri Abhyankar     }
5023b2b2dd24SShri Abhyankar 
5024b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5025b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
5026b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
5027b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5028b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5029b2b2dd24SShri Abhyankar      idt = bs*i;
5030b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5031b2b2dd24SShri Abhyankar 
5032b2b2dd24SShri Abhyankar      for (k=0;k<nz;k++) {
5033b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
5034b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5035b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5036b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5037b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5038b2b2dd24SShri Abhyankar 
5039b2b2dd24SShri Abhyankar         v   +=  bs2;
5040b2b2dd24SShri Abhyankar     }
5041b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5042b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5043b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5044b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5045b2b2dd24SShri Abhyankar 
5046b2b2dd24SShri Abhyankar   }
5047b2b2dd24SShri Abhyankar 
50483649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5049b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5050b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5051b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5052b2b2dd24SShri Abhyankar }
5053b2b2dd24SShri Abhyankar 
5054b2b2dd24SShri Abhyankar #undef __FUNCT__
505506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
505606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
50574e2b4712SSatish Balay {
50584e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
50594e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
50606849ba73SBarry Smith   PetscErrorCode    ierr;
5061b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5062b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
50635d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5064d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5065d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
5066d9fead3dSBarry Smith   const PetscScalar *b;
50674e2b4712SSatish Balay 
50684e2b4712SSatish Balay   PetscFunctionBegin;
50693649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
50701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5071f1af5d2fSBarry Smith   t  = a->solve_work;
50724e2b4712SSatish Balay 
50734e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50744e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
50754e2b4712SSatish Balay 
50764e2b4712SSatish Balay   /* forward solve the lower triangular */
50774e2b4712SSatish Balay   idx    = 2*(*r++);
5078f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
50794e2b4712SSatish Balay   for (i=1; i<n; i++) {
50804e2b4712SSatish Balay     v     = aa + 4*ai[i];
50814e2b4712SSatish Balay     vi    = aj + ai[i];
50824e2b4712SSatish Balay     nz    = diag[i] - ai[i];
50834e2b4712SSatish Balay     idx   = 2*(*r++);
5084f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
50854e2b4712SSatish Balay     while (nz--) {
50864e2b4712SSatish Balay       idx   = 2*(*vi++);
5087f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5088f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5089f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
50904e2b4712SSatish Balay       v += 4;
50914e2b4712SSatish Balay     }
50924e2b4712SSatish Balay     idx = 2*i;
5093f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
50944e2b4712SSatish Balay   }
50954e2b4712SSatish Balay   /* backward solve the upper triangular */
50964e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
50974e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
50984e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
50994e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
51004e2b4712SSatish Balay     idt  = 2*i;
5101f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
51024e2b4712SSatish Balay     while (nz--) {
51034e2b4712SSatish Balay       idx   = 2*(*vi++);
5104f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5105f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5106f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51074e2b4712SSatish Balay       v += 4;
51084e2b4712SSatish Balay     }
51094e2b4712SSatish Balay     idc = 2*(*c--);
51104e2b4712SSatish Balay     v   = aa + 4*diag[i];
5111f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5112f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51134e2b4712SSatish Balay   }
51144e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51154e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51163649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5118dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51194e2b4712SSatish Balay   PetscFunctionReturn(0);
51204e2b4712SSatish Balay }
51214e2b4712SSatish Balay 
51220c4413a7SShri Abhyankar #undef __FUNCT__
51234dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
51244dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
51250c4413a7SShri Abhyankar {
51260c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
51270c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
51280c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5129b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5130b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
51310c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
51320c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
51330c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
51340c4413a7SShri Abhyankar   const PetscScalar *b;
51350c4413a7SShri Abhyankar 
51360c4413a7SShri Abhyankar   PetscFunctionBegin;
51373649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51380c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
51390c4413a7SShri Abhyankar   t  = a->solve_work;
51400c4413a7SShri Abhyankar 
51410c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51420c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
51430c4413a7SShri Abhyankar 
51440c4413a7SShri Abhyankar   /* forward solve the lower triangular */
51450c4413a7SShri Abhyankar   idx    = 2*r[0];
51460c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
51470c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
51480c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
51490c4413a7SShri Abhyankar     vi    = aj + ai[i];
51500c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
51510c4413a7SShri Abhyankar     idx   = 2*r[i];
51520c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
51530c4413a7SShri Abhyankar     for (m=0;m<nz;m++) {
51540c4413a7SShri Abhyankar       jdx   = 2*vi[m];
51550c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
51560c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51570c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51580c4413a7SShri Abhyankar       v += 4;
51590c4413a7SShri Abhyankar     }
51600c4413a7SShri Abhyankar     idx = 2*i;
51610c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
51620c4413a7SShri Abhyankar   }
51630c4413a7SShri Abhyankar   /* backward solve the upper triangular */
51640c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--) {
51650c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
51660c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
51670c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
51680c4413a7SShri Abhyankar     idt  = 2*i;
51690c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
51700c4413a7SShri Abhyankar     for (m=0;m<nz;m++) {
51710c4413a7SShri Abhyankar       idx   = 2*vi[m];
51720c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
51730c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51740c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51750c4413a7SShri Abhyankar       v += 4;
51760c4413a7SShri Abhyankar     }
51770c4413a7SShri Abhyankar     idc = 2*c[i];
51780c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
51790c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51800c4413a7SShri Abhyankar   }
51810c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51820c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51833649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51840c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51850c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51860c4413a7SShri Abhyankar   PetscFunctionReturn(0);
51870c4413a7SShri Abhyankar }
51888f690400SShri Abhyankar 
518915091d37SBarry Smith /*
519015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
519115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
519215091d37SBarry Smith */
51934a2ae208SSatish Balay #undef __FUNCT__
519406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
519506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
519615091d37SBarry Smith {
519715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5198b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5199dfbe8321SBarry Smith   PetscErrorCode    ierr;
5200d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5201d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5202d9fead3dSBarry Smith   const PetscScalar *b;
5203b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
520415091d37SBarry Smith 
520515091d37SBarry Smith   PetscFunctionBegin;
52063649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
52071ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
520815091d37SBarry Smith 
520915091d37SBarry Smith   /* forward solve the lower triangular */
521015091d37SBarry Smith   idx    = 0;
521115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
521215091d37SBarry Smith   for (i=1; i<n; i++) {
521315091d37SBarry Smith     v     =  aa      + 4*ai[i];
521415091d37SBarry Smith     vi    =  aj      + ai[i];
521515091d37SBarry Smith     nz    =  diag[i] - ai[i];
521615091d37SBarry Smith     idx   +=  2;
5217f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
521815091d37SBarry Smith     while (nz--) {
521915091d37SBarry Smith       jdx   = 2*(*vi++);
522015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
5221f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5222f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
522315091d37SBarry Smith       v    += 4;
522415091d37SBarry Smith     }
5225f1af5d2fSBarry Smith     x[idx]   = s1;
5226f1af5d2fSBarry Smith     x[1+idx] = s2;
522715091d37SBarry Smith   }
522815091d37SBarry Smith   /* backward solve the upper triangular */
522915091d37SBarry Smith   for (i=n-1; i>=0; i--) {
523015091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
523115091d37SBarry Smith     vi   = aj + diag[i] + 1;
523215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
523315091d37SBarry Smith     idt  = 2*i;
5234f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
523515091d37SBarry Smith     while (nz--) {
523615091d37SBarry Smith       idx   = 2*(*vi++);
523715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
5238f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5239f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
524015091d37SBarry Smith       v    += 4;
524115091d37SBarry Smith     }
524215091d37SBarry Smith     v        = aa +  4*diag[i];
5243f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5244f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
524515091d37SBarry Smith   }
524615091d37SBarry Smith 
52473649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5249dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
525015091d37SBarry Smith   PetscFunctionReturn(0);
525115091d37SBarry Smith }
525215091d37SBarry Smith 
5253cee9d6f2SShri Abhyankar #undef __FUNCT__
52544dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
52554dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5256b2b2dd24SShri Abhyankar {
5257b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5258b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5259b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,idt,jdx;
5260b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5261b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5262b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
5263b2b2dd24SShri Abhyankar     const PetscScalar *b;
5264b2b2dd24SShri Abhyankar 
5265b2b2dd24SShri Abhyankar     PetscFunctionBegin;
52663649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5267b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5268b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5269b2b2dd24SShri Abhyankar     idx    = 0;
5270b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
5271b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5272b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
5273b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5274b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5275b2b2dd24SShri Abhyankar        idx  = 2*i;
5276b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
52774c0dbd8dSJed Brown        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52784c0dbd8dSJed Brown        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5279b2b2dd24SShri Abhyankar       for (k=0;k<nz;k++) {
5280b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
5281b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
5282b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
5283b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
5284b2b2dd24SShri Abhyankar            v   +=  4;
5285b2b2dd24SShri Abhyankar         }
5286b2b2dd24SShri Abhyankar        x[idx]   = s1;
5287b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5288b2b2dd24SShri Abhyankar     }
5289b2b2dd24SShri Abhyankar 
5290b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5291b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--) {
5292b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
5293b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5294b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5295b2b2dd24SShri Abhyankar      idt = 2*i;
5296b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
52974c0dbd8dSJed Brown      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
52984c0dbd8dSJed Brown      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5299b2b2dd24SShri Abhyankar      for (k=0;k<nz;k++) {
5300b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
5301b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
5302b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
5303b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
5304b2b2dd24SShri Abhyankar          v    += 4;
5305b2b2dd24SShri Abhyankar     }
5306b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5307b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
5308b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
5309b2b2dd24SShri Abhyankar   }
5310b2b2dd24SShri Abhyankar 
53113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5312b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5313b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5314b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5315b2b2dd24SShri Abhyankar }
5316b2b2dd24SShri Abhyankar 
5317b2b2dd24SShri Abhyankar #undef __FUNCT__
531806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
531906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
53204e2b4712SSatish Balay {
53214e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
53224e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
53236849ba73SBarry Smith   PetscErrorCode    ierr;
5324b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5325b3260449SShri Abhyankar   PetscInt          i,nz;
53265d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5327b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5328b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5329b3260449SShri Abhyankar   const PetscScalar *b;
53304e2b4712SSatish Balay 
53314e2b4712SSatish Balay   PetscFunctionBegin;
53324e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
53334e2b4712SSatish Balay 
53343649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
53351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5336f1af5d2fSBarry Smith   t  = a->solve_work;
53374e2b4712SSatish Balay 
53384e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
53394e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
53404e2b4712SSatish Balay 
53414e2b4712SSatish Balay   /* forward solve the lower triangular */
5342f1af5d2fSBarry Smith   t[0] = b[*r++];
53434e2b4712SSatish Balay   for (i=1; i<n; i++) {
53444e2b4712SSatish Balay     v     = aa + ai[i];
53454e2b4712SSatish Balay     vi    = aj + ai[i];
53464e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5347f1af5d2fSBarry Smith     s1  = b[*r++];
53484e2b4712SSatish Balay     while (nz--) {
5349f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53504e2b4712SSatish Balay     }
5351f1af5d2fSBarry Smith     t[i] = s1;
53524e2b4712SSatish Balay   }
53534e2b4712SSatish Balay   /* backward solve the upper triangular */
53544e2b4712SSatish Balay   for (i=n-1; i>=0; i--) {
53554e2b4712SSatish Balay     v    = aa + diag[i] + 1;
53564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
53574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5358f1af5d2fSBarry Smith     s1 = t[i];
53594e2b4712SSatish Balay     while (nz--) {
5360f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53614e2b4712SSatish Balay     }
5362f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
53634e2b4712SSatish Balay   }
53644e2b4712SSatish Balay 
53654e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
53664e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
53673649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
53681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5369dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53704e2b4712SSatish Balay   PetscFunctionReturn(0);
53714e2b4712SSatish Balay }
5372048b5e81SShri Abhyankar 
5373048b5e81SShri Abhyankar #undef __FUNCT__
5374048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5375048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5376048b5e81SShri Abhyankar {
5377048b5e81SShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5378048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5379048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5380048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5381048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5382048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5383048b5e81SShri Abhyankar   const PetscScalar *b;
5384048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5385048b5e81SShri Abhyankar 
5386048b5e81SShri Abhyankar   PetscFunctionBegin;
5387048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5388048b5e81SShri Abhyankar 
53893649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5390048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5391048b5e81SShri Abhyankar   tmp  = a->solve_work;
5392048b5e81SShri Abhyankar 
5393048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5394048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5395048b5e81SShri Abhyankar 
5396048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5397048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5398048b5e81SShri Abhyankar   v      = aa;
5399048b5e81SShri Abhyankar   vi     = aj;
5400048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5401048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5402048b5e81SShri Abhyankar     sum = b[r[i]];
5403048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5404048b5e81SShri Abhyankar     tmp[i] = sum;
5405048b5e81SShri Abhyankar     v += nz; vi += nz;
5406048b5e81SShri Abhyankar   }
5407048b5e81SShri Abhyankar 
5408048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5409048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--) {
5410048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5411048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5412048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5413048b5e81SShri Abhyankar     sum = tmp[i];
5414048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5415048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5416048b5e81SShri Abhyankar   }
5417048b5e81SShri Abhyankar 
5418048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5419048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54203649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5421048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5422048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5423048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5424048b5e81SShri Abhyankar }
5425048b5e81SShri Abhyankar 
542615091d37SBarry Smith /*
542715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
542815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
542915091d37SBarry Smith */
54304a2ae208SSatish Balay #undef __FUNCT__
543106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
543206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
543315091d37SBarry Smith {
543415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5435b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5436dfbe8321SBarry Smith   PetscErrorCode    ierr;
5437b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5438b3260449SShri Abhyankar   PetscScalar       *x;
5439b3260449SShri Abhyankar   const PetscScalar *b;
544087828ca2SBarry Smith   PetscScalar       s1,x1;
5441b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
544215091d37SBarry Smith 
544315091d37SBarry Smith   PetscFunctionBegin;
54443649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
54451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544615091d37SBarry Smith 
544715091d37SBarry Smith   /* forward solve the lower triangular */
544815091d37SBarry Smith   idx    = 0;
544915091d37SBarry Smith   x[0]   = b[0];
545015091d37SBarry Smith   for (i=1; i<n; i++) {
545115091d37SBarry Smith     v     =  aa      + ai[i];
545215091d37SBarry Smith     vi    =  aj      + ai[i];
545315091d37SBarry Smith     nz    =  diag[i] - ai[i];
545415091d37SBarry Smith     idx   +=  1;
5455f1af5d2fSBarry Smith     s1  =  b[idx];
545615091d37SBarry Smith     while (nz--) {
545715091d37SBarry Smith       jdx   = *vi++;
545815091d37SBarry Smith       x1    = x[jdx];
5459f1af5d2fSBarry Smith       s1 -= v[0]*x1;
546015091d37SBarry Smith       v    += 1;
546115091d37SBarry Smith     }
5462f1af5d2fSBarry Smith     x[idx]   = s1;
546315091d37SBarry Smith   }
546415091d37SBarry Smith   /* backward solve the upper triangular */
546515091d37SBarry Smith   for (i=n-1; i>=0; i--) {
546615091d37SBarry Smith     v    = aa + diag[i] + 1;
546715091d37SBarry Smith     vi   = aj + diag[i] + 1;
546815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
546915091d37SBarry Smith     idt  = i;
5470f1af5d2fSBarry Smith     s1 = x[idt];
547115091d37SBarry Smith     while (nz--) {
547215091d37SBarry Smith       idx   = *vi++;
547315091d37SBarry Smith       x1    = x[idx];
5474f1af5d2fSBarry Smith       s1 -= v[0]*x1;
547515091d37SBarry Smith       v    += 1;
547615091d37SBarry Smith     }
547715091d37SBarry Smith     v        = aa +  diag[i];
5478f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
547915091d37SBarry Smith   }
54803649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5482dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
548315091d37SBarry Smith   PetscFunctionReturn(0);
548415091d37SBarry Smith }
54854e2b4712SSatish Balay 
5486048b5e81SShri Abhyankar 
5487048b5e81SShri Abhyankar #undef __FUNCT__
5488048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5489048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5490048b5e81SShri Abhyankar {
5491048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5492048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5493048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5494048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5495048b5e81SShri Abhyankar   const PetscScalar *b;
5496048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5497048b5e81SShri Abhyankar   PetscInt          i,nz;
5498048b5e81SShri Abhyankar 
5499048b5e81SShri Abhyankar   PetscFunctionBegin;
5500048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5501048b5e81SShri Abhyankar 
55023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5503048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5504048b5e81SShri Abhyankar 
5505048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5506048b5e81SShri Abhyankar   x[0] = b[0];
5507048b5e81SShri Abhyankar   v    = aa;
5508048b5e81SShri Abhyankar   vi   = aj;
5509048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5510048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5511048b5e81SShri Abhyankar     sum = b[i];
5512048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5513048b5e81SShri Abhyankar     v  += nz;
5514048b5e81SShri Abhyankar     vi += nz;
5515048b5e81SShri Abhyankar     x[i] = sum;
5516048b5e81SShri Abhyankar   }
5517048b5e81SShri Abhyankar 
5518048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5519048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--) {
5520048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5521048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5522048b5e81SShri Abhyankar     nz = adiag[i] - adiag[i+1]-1;
5523048b5e81SShri Abhyankar     sum = x[i];
5524048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5525048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5526048b5e81SShri Abhyankar   }
5527048b5e81SShri Abhyankar 
5528048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
55293649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5530048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5531048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5532048b5e81SShri Abhyankar }
5533048b5e81SShri Abhyankar 
55344e2b4712SSatish Balay /* ----------------------------------------------------------------*/
553509573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
55366bce7ff8SHong Zhang 
55372b0b2ea7SShri Abhyankar #undef __FUNCT__
553829a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5539766f9fbaSBarry Smith /*
5540766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5541766f9fbaSBarry Smith */
554229a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
55432b0b2ea7SShri Abhyankar {
55442b0b2ea7SShri Abhyankar   Mat             C=B;
55452b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
55462b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5547766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5548766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5549766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5550766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5551766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
55522b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
55530fa040f9SShri Abhyankar   PetscInt        sol_ver;
55542b0b2ea7SShri Abhyankar 
55552b0b2ea7SShri Abhyankar   PetscFunctionBegin;
5556c55dd799SBarry Smith   ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
55570fa040f9SShri Abhyankar 
55582b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
55592b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
55602b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
55612b0b2ea7SShri Abhyankar 
55622b0b2ea7SShri Abhyankar   for (i=0; i<n; i++) {
55632b0b2ea7SShri Abhyankar     /* zero rtmp */
55642b0b2ea7SShri Abhyankar     /* L part */
55652b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
55662b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55672b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++) {
55682b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55692b0b2ea7SShri Abhyankar     }
55702b0b2ea7SShri Abhyankar 
55712b0b2ea7SShri Abhyankar     /* U part */
55722b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
55732b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
55742b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++) {
55752b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55762b0b2ea7SShri Abhyankar     }
55772b0b2ea7SShri Abhyankar 
55782b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
557929a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
558029a97285SShri Abhyankar     ajtmp = aj + ai[i];
558129a97285SShri Abhyankar     v     = aa + bs2*ai[i];
55822b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
558329a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
55842b0b2ea7SShri Abhyankar     }
55852b0b2ea7SShri Abhyankar 
55862b0b2ea7SShri Abhyankar     /* elimination */
55872b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55882b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
55892b0b2ea7SShri Abhyankar     for (k=0;k < nzL;k++) {
55902b0b2ea7SShri Abhyankar       row = bjtmp[k];
55912b0b2ea7SShri Abhyankar       pc = rtmp + bs2*row;
5592c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
5593c35f09e5SBarry Smith         if (pc[j]!=0.0) {
5594c35f09e5SBarry Smith           flg = 1;
5595c35f09e5SBarry Smith           break;
5596c35f09e5SBarry Smith         }
5597c35f09e5SBarry Smith       }
55982b0b2ea7SShri Abhyankar       if (flg) {
55992b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
560096b95a6bSBarry Smith         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
560196b95a6bSBarry Smith         /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
56022b0b2ea7SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
56032b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
56042b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56052b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5606766f9fbaSBarry Smith           vv   = rtmp + bs2*pj[j];
560796b95a6bSBarry Smith           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
560896b95a6bSBarry Smith           /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
56092b0b2ea7SShri Abhyankar           pv  += bs2;
56102b0b2ea7SShri Abhyankar         }
5611766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56122b0b2ea7SShri Abhyankar       }
56132b0b2ea7SShri Abhyankar     }
56142b0b2ea7SShri Abhyankar 
56152b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
56162b0b2ea7SShri Abhyankar     /* L part */
56172b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
56182b0b2ea7SShri Abhyankar     pj   = b->j + bi[i] ;
56192b0b2ea7SShri Abhyankar     nz   = bi[i+1] - bi[i];
56202b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56212b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56222b0b2ea7SShri Abhyankar     }
56232b0b2ea7SShri Abhyankar 
56242b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56252b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
56262b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
56272b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
562896b95a6bSBarry Smith     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
562996b95a6bSBarry Smith     ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
56302b0b2ea7SShri Abhyankar 
56312b0b2ea7SShri Abhyankar     /* U part */
56322b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56332b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56342b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56352b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56362b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56372b0b2ea7SShri Abhyankar     }
56382b0b2ea7SShri Abhyankar   }
56392b0b2ea7SShri Abhyankar 
56402b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5641832cc040SShri Abhyankar   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5642766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
56432b0b2ea7SShri Abhyankar   C->assembled = PETSC_TRUE;
5644766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56452b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
56462b0b2ea7SShri Abhyankar }
56472b0b2ea7SShri Abhyankar 
56486bce7ff8SHong Zhang #undef __FUNCT__
56494dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
56504dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
56516bce7ff8SHong Zhang {
56526bce7ff8SHong Zhang   Mat            C=B;
56536bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
56546bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
56556bce7ff8SHong Zhang   PetscErrorCode ierr;
56565a586d82SBarry Smith   const PetscInt *r,*ic;
56576bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
56586bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5659b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5660914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5661914a18a2SHong Zhang   MatScalar      *v_work;
5662ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity;
56636bce7ff8SHong Zhang 
56646bce7ff8SHong Zhang   PetscFunctionBegin;
56656bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
56666bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5667ae3d28f0SHong Zhang 
5668fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5669fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
56706bce7ff8SHong Zhang 
5671914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5672fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5673914a18a2SHong Zhang 
56746bce7ff8SHong Zhang   for (i=0; i<n; i++) {
56756bce7ff8SHong Zhang     /* zero rtmp */
56766bce7ff8SHong Zhang     /* L part */
56776bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
56786bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5679914a18a2SHong Zhang     for  (j=0; j<nz; j++) {
5680914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5681914a18a2SHong Zhang     }
56826bce7ff8SHong Zhang 
56836bce7ff8SHong Zhang     /* U part */
56841a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
56851a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56861a83e813SShri Abhyankar     for  (j=0; j<nz; j++) {
56871a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56881a83e813SShri Abhyankar     }
56891a83e813SShri Abhyankar 
56901a83e813SShri Abhyankar     /* load in initial (unfactored row) */
56911a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
56921a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
56931a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
56941a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
56951a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56961a83e813SShri Abhyankar     }
56971a83e813SShri Abhyankar 
56981a83e813SShri Abhyankar     /* elimination */
56991a83e813SShri Abhyankar     bjtmp = bj + bi[i];
57001a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
57011a83e813SShri Abhyankar     for (k=0;k < nzL;k++) {
57021a83e813SShri Abhyankar       row = bjtmp[k];
57031a83e813SShri Abhyankar       pc = rtmp + bs2*row;
5704c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
5705c35f09e5SBarry Smith         if (pc[j]!=0.0) {
5706c35f09e5SBarry Smith           flg = 1;
5707c35f09e5SBarry Smith           break;
5708c35f09e5SBarry Smith         }
5709c35f09e5SBarry Smith       }
57101a83e813SShri Abhyankar       if (flg) {
57111a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
571296b95a6bSBarry Smith         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
57131a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
57141a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
57151a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
57161a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
571796b95a6bSBarry Smith           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
57181a83e813SShri Abhyankar         }
57191a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
57201a83e813SShri Abhyankar       }
57211a83e813SShri Abhyankar     }
57221a83e813SShri Abhyankar 
57231a83e813SShri Abhyankar     /* finished row so stick it into b->a */
57241a83e813SShri Abhyankar     /* L part */
57251a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
57261a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
57271a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
57281a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57291a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57301a83e813SShri Abhyankar     }
57311a83e813SShri Abhyankar 
57321a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
57331a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
57341a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
5735e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
57361a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
573796b95a6bSBarry Smith     ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
57381a83e813SShri Abhyankar 
57391a83e813SShri Abhyankar     /* U part */
57401a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
57411a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
57421a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
57431a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57441a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57451a83e813SShri Abhyankar     }
57461a83e813SShri Abhyankar   }
57471a83e813SShri Abhyankar 
57481a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5749fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
57501a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
57511a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
57521a83e813SShri Abhyankar 
5753ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5754ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5755ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
5756ae3d28f0SHong Zhang   if (both_identity) {
57574dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5758ae3d28f0SHong Zhang   } else {
57594dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5760ae3d28f0SHong Zhang   }
57614dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5762ae3d28f0SHong Zhang 
57631a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5764766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
57651a83e813SShri Abhyankar   PetscFunctionReturn(0);
57661a83e813SShri Abhyankar }
57671a83e813SShri Abhyankar 
57686bce7ff8SHong Zhang /*
57696bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
57704dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
57714dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
57726bce7ff8SHong Zhang */
5773c0c7eb62SShri Abhyankar 
57746bce7ff8SHong Zhang #undef __FUNCT__
57754dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
57764dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
57776bce7ff8SHong Zhang {
57786bce7ff8SHong Zhang 
57796bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
57806bce7ff8SHong Zhang   PetscErrorCode     ierr;
578116a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
578235aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
578335aa4fcfSShri Abhyankar 
578435aa4fcfSShri Abhyankar   PetscFunctionBegin;
578535aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
578635aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
578735aa4fcfSShri Abhyankar 
578835aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
578935aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
579035aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
579135aa4fcfSShri Abhyankar   b->singlemalloc    = PETSC_TRUE;
5792379be0ddSLisandro Dalcin   b->free_a          = PETSC_TRUE;
5793379be0ddSLisandro Dalcin   b->free_ij         = PETSC_TRUE;
57941e40a84eSLisandro Dalcin   fact->preallocated = PETSC_TRUE;
57951e40a84eSLisandro Dalcin   fact->assembled    = PETSC_TRUE;
579635aa4fcfSShri Abhyankar   if (!b->diag) {
579735aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
579835aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
579935aa4fcfSShri Abhyankar   }
580035aa4fcfSShri Abhyankar   bdiag = b->diag;
580135aa4fcfSShri Abhyankar 
580235aa4fcfSShri Abhyankar   if (n > 0) {
580335aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
580435aa4fcfSShri Abhyankar   }
580535aa4fcfSShri Abhyankar 
580635aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
580735aa4fcfSShri Abhyankar   bi = b->i;
580835aa4fcfSShri Abhyankar   bj = b->j;
580935aa4fcfSShri Abhyankar 
581035aa4fcfSShri Abhyankar   /* L part */
581135aa4fcfSShri Abhyankar   bi[0] = 0;
581235aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
581335aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
581435aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
581535aa4fcfSShri Abhyankar     aj = a->j + ai[i];
581635aa4fcfSShri Abhyankar     for (j=0; j<nz; j++) {
581735aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
581835aa4fcfSShri Abhyankar     }
581935aa4fcfSShri Abhyankar   }
582035aa4fcfSShri Abhyankar 
582135aa4fcfSShri Abhyankar   /* U part */
582235aa4fcfSShri Abhyankar   bi_temp = bi[n];
582335aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
582435aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--) {
582535aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
582635aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
582735aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
582835aa4fcfSShri Abhyankar     for (j=0; j<nz; j++) {
582935aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
583035aa4fcfSShri Abhyankar     }
583135aa4fcfSShri Abhyankar     /* diag[i] */
583235aa4fcfSShri Abhyankar     *bj = i; bj++;
583335aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
583435aa4fcfSShri Abhyankar   }
583535aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
583635aa4fcfSShri Abhyankar }
583735aa4fcfSShri Abhyankar 
583835aa4fcfSShri Abhyankar #undef __FUNCT__
58394dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58404dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
584116a2bf60SHong Zhang {
584216a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
584316a2bf60SHong Zhang   IS                 isicol;
584416a2bf60SHong Zhang   PetscErrorCode     ierr;
584516a2bf60SHong Zhang   const PetscInt     *r,*ic;
58467fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
584716a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
584816a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
584916a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
5850ace3abfcSBarry Smith   PetscBool          col_identity,row_identity,both_identity;
585116a2bf60SHong Zhang   PetscReal          f;
585216a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
585316a2bf60SHong Zhang   PetscBT            lnkbt;
585416a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
585516a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
585616a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5857ace3abfcSBarry Smith   PetscBool          missing;
58587fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
585916a2bf60SHong Zhang 
586016a2bf60SHong Zhang   PetscFunctionBegin;
5861e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
58626ba06ab7SHong Zhang   if (bs>1) {  /* check shifttype */
58636ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
58646ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
58656ba06ab7SHong Zhang   }
58666ba06ab7SHong Zhang 
586716a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5868e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
586916a2bf60SHong Zhang 
587016a2bf60SHong Zhang   f             = info->fill;
587116a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
587216a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
587316a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
587416a2bf60SHong Zhang 
587516a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
587616a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5877ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
587816a2bf60SHong Zhang 
58797fa3a6a0SHong Zhang   if (!levels && both_identity) {
588016a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
58814dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
58824dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
588335aa4fcfSShri Abhyankar 
5884d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
588535aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
588635aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
588735aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
588835aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
588935aa4fcfSShri Abhyankar     b->row           = isrow;
589035aa4fcfSShri Abhyankar     b->col           = iscol;
589135aa4fcfSShri Abhyankar     b->icol          = isicol;
589235aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
589335aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
589435aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
589535aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
589635aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
589735aa4fcfSShri Abhyankar   }
589835aa4fcfSShri Abhyankar 
589935aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
590035aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
590135aa4fcfSShri Abhyankar 
590235aa4fcfSShri Abhyankar   /* get new row pointers */
590335aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
590435aa4fcfSShri Abhyankar   bi[0] = 0;
590535aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
590635aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
590735aa4fcfSShri Abhyankar   bdiag[0]  = 0;
590835aa4fcfSShri Abhyankar 
5909fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
591035aa4fcfSShri Abhyankar 
591135aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
591235aa4fcfSShri Abhyankar   nlnk = n + 1;
591335aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
591435aa4fcfSShri Abhyankar 
591535aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
591635aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
591735aa4fcfSShri Abhyankar   current_space = free_space;
591835aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
591935aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
592035aa4fcfSShri Abhyankar 
592135aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
592235aa4fcfSShri Abhyankar     nzi = 0;
592335aa4fcfSShri Abhyankar     /* copy current row into linked list */
592435aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5925e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
592635aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
592735aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
592835aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
592935aa4fcfSShri Abhyankar     nzi += nlnk;
593035aa4fcfSShri Abhyankar 
593135aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
593235aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
593335aa4fcfSShri Abhyankar       fm = n;
593435aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
593535aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
593635aa4fcfSShri Abhyankar       lnk[fm]    = i;
593735aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
593835aa4fcfSShri Abhyankar       nzi++; dcount++;
593935aa4fcfSShri Abhyankar     }
594035aa4fcfSShri Abhyankar 
594135aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
594235aa4fcfSShri Abhyankar     nzbd = 0;
594335aa4fcfSShri Abhyankar     prow = lnk[n];
594435aa4fcfSShri Abhyankar     while (prow < i) {
594535aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
594635aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
594735aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
594835aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
594935aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
595035aa4fcfSShri Abhyankar       nzi += nlnk;
595135aa4fcfSShri Abhyankar       prow = lnk[prow];
595235aa4fcfSShri Abhyankar       nzbd++;
595335aa4fcfSShri Abhyankar     }
595435aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
595535aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
595635aa4fcfSShri Abhyankar 
595735aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
595835aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
595935aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
596035aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
596135aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
596235aa4fcfSShri Abhyankar       reallocs++;
596335aa4fcfSShri Abhyankar     }
596435aa4fcfSShri Abhyankar 
596535aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
596635aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
596735aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
596835aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
596935aa4fcfSShri Abhyankar 
597035aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
597165e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
597235aa4fcfSShri Abhyankar 
597335aa4fcfSShri Abhyankar     current_space->array           += nzi;
597435aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
597535aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
597635aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
597735aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
597835aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
597935aa4fcfSShri Abhyankar   }
598035aa4fcfSShri Abhyankar 
598135aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
598235aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
598335aa4fcfSShri Abhyankar 
598435aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
59859263d837SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
59862ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
598735aa4fcfSShri Abhyankar 
598835aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
598935aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5990fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
599135aa4fcfSShri Abhyankar 
599235aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
599335aa4fcfSShri Abhyankar   {
5994aef85c9fSShri Abhyankar     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
599535aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
599635aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
599735aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
599835aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
599935aa4fcfSShri Abhyankar     if (diagonal_fill) {
600035aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
600135aa4fcfSShri Abhyankar     }
600235aa4fcfSShri Abhyankar   }
600335aa4fcfSShri Abhyankar #endif
600435aa4fcfSShri Abhyankar 
600535aa4fcfSShri Abhyankar   /* put together the new matrix */
600635aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
600735aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
600835aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
600935aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
601035aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
601135aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
601235aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
601335aa4fcfSShri Abhyankar   b->j          = bj;
601435aa4fcfSShri Abhyankar   b->i          = bi;
601535aa4fcfSShri Abhyankar   b->diag       = bdiag;
601635aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
601735aa4fcfSShri Abhyankar   b->ilen       = 0;
601835aa4fcfSShri Abhyankar   b->imax       = 0;
601935aa4fcfSShri Abhyankar   b->row        = isrow;
602035aa4fcfSShri Abhyankar   b->col        = iscol;
602135aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
602235aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
602335aa4fcfSShri Abhyankar   b->icol       = isicol;
602435aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
602535aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
602635aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
602735aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
602835aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
6029ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
6030ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6031ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
60324dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
603335aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
603435aa4fcfSShri Abhyankar }
603535aa4fcfSShri Abhyankar 
60364e2b4712SSatish Balay /*
60374e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
60384e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
60394e2b4712SSatish Balay    Not a good example of code reuse.
60404e2b4712SSatish Balay */
60414a2ae208SSatish Balay #undef __FUNCT__
604206e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
604306e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
60444e2b4712SSatish Balay {
60454e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
60464e2b4712SSatish Balay   IS             isicol;
60476849ba73SBarry Smith   PetscErrorCode ierr;
60485d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
60495d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6050a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6051d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6052ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity,flg;
6053329f5518SBarry Smith   PetscReal      f;
60544e2b4712SSatish Balay 
60554e2b4712SSatish Balay   PetscFunctionBegin;
60566bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6057e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
60586bce7ff8SHong Zhang 
6059435faa5fSBarry Smith   f             = info->fill;
6060690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
6061690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
60624c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
606316a2bf60SHong Zhang 
6064667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6065667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6066ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
6067309c388cSBarry Smith 
606841df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
606916a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
60708b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
60716bce7ff8SHong Zhang 
6072d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
6073ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
6074bb3d539aSBarry Smith     b->row       = isrow;
6075bb3d539aSBarry Smith     b->col       = iscol;
6076bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6077bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6078bb3d539aSBarry Smith     b->icol      = isicol;
6079bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6080b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
60816bce7ff8SHong Zhang     PetscFunctionReturn(0);
60826bce7ff8SHong Zhang   }
60836bce7ff8SHong Zhang 
60846bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
60854e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
60864e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
60874e2b4712SSatish Balay 
60884e2b4712SSatish Balay     /* get new row pointers */
6089690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
60904e2b4712SSatish Balay     ainew[0] = 0;
60914e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
6092690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
6093690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
60944e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
6095690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
60964e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
6097690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
60984e2b4712SSatish Balay     /* im is level for each filled value */
6099690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
61004e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
6101690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
61024e2b4712SSatish Balay     dloc[0]  = 0;
61034e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
6104435faa5fSBarry Smith 
6105435faa5fSBarry Smith       /* copy prow into linked list */
61064e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6107e32f2f54SBarry Smith       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
61084e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
61094e2b4712SSatish Balay       fill[n]    = n;
6110435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
61114e2b4712SSatish Balay       while (nz--) {
61124e2b4712SSatish Balay         fm  = n;
61134e2b4712SSatish Balay         idx = ic[*xi++];
61144e2b4712SSatish Balay         do {
61154e2b4712SSatish Balay           m  = fm;
61164e2b4712SSatish Balay           fm = fill[m];
61174e2b4712SSatish Balay         } while (fm < idx);
61184e2b4712SSatish Balay         fill[m]   = idx;
61194e2b4712SSatish Balay         fill[idx] = fm;
61204e2b4712SSatish Balay         im[idx]   = 0;
61214e2b4712SSatish Balay       }
6122435faa5fSBarry Smith 
6123435faa5fSBarry Smith       /* make sure diagonal entry is included */
6124435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
6125435faa5fSBarry Smith         fm = n;
6126435faa5fSBarry Smith         while (fill[fm] < prow) fm = fill[fm];
6127435faa5fSBarry Smith         fill[prow] = fill[fm];  /* insert diagonal into linked list */
6128435faa5fSBarry Smith         fill[fm]   = prow;
6129435faa5fSBarry Smith         im[prow]   = 0;
6130435faa5fSBarry Smith         nzf++;
6131335d9088SBarry Smith         dcount++;
6132435faa5fSBarry Smith       }
6133435faa5fSBarry Smith 
61344e2b4712SSatish Balay       nzi = 0;
61354e2b4712SSatish Balay       row = fill[n];
61364e2b4712SSatish Balay       while (row < prow) {
61374e2b4712SSatish Balay         incrlev = im[row] + 1;
61384e2b4712SSatish Balay         nz      = dloc[row];
6139435faa5fSBarry Smith         xi      = ajnew  + ainew[row] + nz + 1;
61404e2b4712SSatish Balay         flev    = ajfill + ainew[row] + nz + 1;
61414e2b4712SSatish Balay         nnz     = ainew[row+1] - ainew[row] - nz - 1;
61424e2b4712SSatish Balay         fm      = row;
61434e2b4712SSatish Balay         while (nnz-- > 0) {
61444e2b4712SSatish Balay           idx = *xi++;
61454e2b4712SSatish Balay           if (*flev + incrlev > levels) {
61464e2b4712SSatish Balay             flev++;
61474e2b4712SSatish Balay             continue;
61484e2b4712SSatish Balay           }
61494e2b4712SSatish Balay           do {
61504e2b4712SSatish Balay             m  = fm;
61514e2b4712SSatish Balay             fm = fill[m];
61524e2b4712SSatish Balay           } while (fm < idx);
61534e2b4712SSatish Balay           if (fm != idx) {
61544e2b4712SSatish Balay             im[idx]   = *flev + incrlev;
61554e2b4712SSatish Balay             fill[m]   = idx;
61564e2b4712SSatish Balay             fill[idx] = fm;
61574e2b4712SSatish Balay             fm        = idx;
61584e2b4712SSatish Balay             nzf++;
6159ecf371e4SBarry Smith           } else {
61604e2b4712SSatish Balay             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
61614e2b4712SSatish Balay           }
61624e2b4712SSatish Balay           flev++;
61634e2b4712SSatish Balay         }
61644e2b4712SSatish Balay         row = fill[row];
61654e2b4712SSatish Balay         nzi++;
61664e2b4712SSatish Balay       }
61674e2b4712SSatish Balay       /* copy new filled row into permanent storage */
61684e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
61694e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6170ecf371e4SBarry Smith 
6171ecf371e4SBarry Smith         /* estimate how much additional space we will need */
6172ecf371e4SBarry Smith         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6173ecf371e4SBarry Smith         /* just double the memory each time */
6174690b6cddSBarry Smith         PetscInt maxadd = jmax;
6175ecf371e4SBarry Smith         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
61764e2b4712SSatish Balay         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
61774e2b4712SSatish Balay         jmax += maxadd;
6178ecf371e4SBarry Smith 
6179ecf371e4SBarry Smith         /* allocate a longer ajnew and ajfill */
61805d0c19d7SBarry Smith         ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61815d0c19d7SBarry Smith         ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6182606d414cSSatish Balay         ierr = PetscFree(ajnew);CHKERRQ(ierr);
61835d0c19d7SBarry Smith         ajnew = xitmp;
61845d0c19d7SBarry Smith         ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61855d0c19d7SBarry Smith         ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6186606d414cSSatish Balay         ierr = PetscFree(ajfill);CHKERRQ(ierr);
61875d0c19d7SBarry Smith         ajfill = xitmp;
6188eb150c5cSKris Buschelman         reallocate++; /* count how many reallocations are needed */
61894e2b4712SSatish Balay       }
61905d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
61914e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
61924e2b4712SSatish Balay       dloc[prow]  = nzi;
61934e2b4712SSatish Balay       fm          = fill[n];
61944e2b4712SSatish Balay       while (nzf--) {
61955d0c19d7SBarry Smith         *xitmp++ = fm;
61964e2b4712SSatish Balay         *flev++ = im[fm];
61974e2b4712SSatish Balay         fm      = fill[fm];
61984e2b4712SSatish Balay       }
6199435faa5fSBarry Smith       /* make sure row has diagonal entry */
6200435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6201e32f2f54SBarry Smith         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
62022401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6203435faa5fSBarry Smith       }
62044e2b4712SSatish Balay     }
6205606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
62064e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
62074e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6208606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6209606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
62104e2b4712SSatish Balay 
62116cf91177SBarry Smith #if defined(PETSC_USE_INFO)
62124e2b4712SSatish Balay     {
6213329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6214ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6215ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6216ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6217ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6218335d9088SBarry Smith       if (diagonal_fill) {
6219ae15b995SBarry Smith         ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6220335d9088SBarry Smith       }
62214e2b4712SSatish Balay     }
622263ba0a88SBarry Smith #endif
62234e2b4712SSatish Balay 
62244e2b4712SSatish Balay     /* put together the new matrix */
6225719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6226719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6227ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
6228e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6229e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
62307c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6231a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
62324e2b4712SSatish Balay     b->j          = ajnew;
62334e2b4712SSatish Balay     b->i          = ainew;
62344e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
62354e2b4712SSatish Balay     b->diag       = dloc;
62367f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
62374e2b4712SSatish Balay     b->ilen       = 0;
62384e2b4712SSatish Balay     b->imax       = 0;
62394e2b4712SSatish Balay     b->row        = isrow;
62404e2b4712SSatish Balay     b->col        = iscol;
6241bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6242c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6243c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6244e51c0b9cSSatish Balay     b->icol       = isicol;
624587828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
62464e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
62474e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6248719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
62494e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
62504e2b4712SSatish Balay 
6251ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
6252ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
6253ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
62546bce7ff8SHong Zhang 
62558b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
62568661488fSKris Buschelman   PetscFunctionReturn(0);
62578661488fSKris Buschelman }
62588661488fSKris Buschelman 
6259732ee342SKris Buschelman #undef __FUNCT__
62607e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6261dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
62627e7071cdSKris Buschelman {
626312272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
626412272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
62655a9542e3SKris Buschelman   PetscFunctionBegin;
62667cf1b8d3SKris Buschelman   /* Undo Column scaling */
62677cf1b8d3SKris Buschelman /*    while (nz--) { */
62687cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
62697cf1b8d3SKris Buschelman /*    } */
6270c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6271c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62727cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
62737cf1b8d3SKris Buschelman }
62747cf1b8d3SKris Buschelman 
62757cf1b8d3SKris Buschelman #undef __FUNCT__
62767cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6277dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
62787cf1b8d3SKris Buschelman {
62797cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6280b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
62812aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
62825a9542e3SKris Buschelman   PetscFunctionBegin;
62830b9da03eSKris Buschelman   /* Is this really necessary? */
628420235379SKris Buschelman   while (nz--) {
62850b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
62867e7071cdSKris Buschelman   }
6287c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62887e7071cdSKris Buschelman   PetscFunctionReturn(0);
62897e7071cdSKris Buschelman }
62907e7071cdSKris Buschelman 
6291732ee342SKris Buschelman 
6292